From 74eb24da446ab508063a9ec72c16502d31cc68ff Mon Sep 17 00:00:00 2001 From: Cameron Gutman Date: Thu, 11 Jul 2024 20:22:57 -0500 Subject: [PATCH] Provide SSSE3, AVX2, and AVX512 optimized Reed-Solomon functions (#2828) * Provide SSSE3, AVX2, and AVX512 optimized Reed-Solomon functions * Update nanors to fix AVX-512 memory corruption --- cmake/compile_definitions/common.cmake | 4 +- cmake/targets/common.cmake | 4 +- src/main.cpp | 2 +- src/rswrapper.c | 157 +++++++++++++++++++++++++ src/rswrapper.h | 32 +++++ src/stream.cpp | 7 +- tests/unit/test_rswrapper.cpp | 37 ++++++ third-party/nanors | 2 +- 8 files changed, 236 insertions(+), 9 deletions(-) create mode 100644 src/rswrapper.c create mode 100644 src/rswrapper.h create mode 100644 tests/unit/test_rswrapper.cpp diff --git a/cmake/compile_definitions/common.cmake b/cmake/compile_definitions/common.cmake index e7436825cce..f4a91a78ad0 100644 --- a/cmake/compile_definitions/common.cmake +++ b/cmake/compile_definitions/common.cmake @@ -49,8 +49,6 @@ configure_file("${CMAKE_SOURCE_DIR}/src/version.h.in" version.h @ONLY) include_directories("${CMAKE_CURRENT_BINARY_DIR}") # required for importing version.h set(SUNSHINE_TARGET_FILES - "${CMAKE_SOURCE_DIR}/third-party/nanors/rs.c" - "${CMAKE_SOURCE_DIR}/third-party/nanors/rs.h" "${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/Input.h" "${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/Rtsp.h" "${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/RtspParser.c" @@ -108,6 +106,8 @@ set(SUNSHINE_TARGET_FILES "${CMAKE_SOURCE_DIR}/src/round_robin.h" "${CMAKE_SOURCE_DIR}/src/stat_trackers.h" "${CMAKE_SOURCE_DIR}/src/stat_trackers.cpp" + "${CMAKE_SOURCE_DIR}/src/rswrapper.h" + "${CMAKE_SOURCE_DIR}/src/rswrapper.c" ${PLATFORM_TARGET_FILES}) if(NOT SUNSHINE_ASSETS_DIR_DEF) diff --git a/cmake/targets/common.cmake b/cmake/targets/common.cmake index 8ceae30c856..d8e5ce3c515 100644 --- a/cmake/targets/common.cmake +++ b/cmake/targets/common.cmake @@ -85,9 +85,9 @@ 
set_source_files_properties("${CMAKE_SOURCE_DIR}/src/upnp.cpp" PROPERTIES COMPILE_FLAGS -Wno-pedantic) # third-party/nanors -set_source_files_properties("${CMAKE_SOURCE_DIR}/third-party/nanors/rs.c" +set_source_files_properties("${CMAKE_SOURCE_DIR}/src/rswrapper.c" DIRECTORY "${CMAKE_SOURCE_DIR}" "${TEST_DIR}" - PROPERTIES COMPILE_FLAGS "-include deps/obl/autoshim.h -ftree-vectorize") + PROPERTIES COMPILE_FLAGS "-ftree-vectorize -funroll-loops") # third-party/ViGEmClient set(VIGEM_COMPILE_FLAGS "") diff --git a/src/main.cpp b/src/main.cpp index f70cc637c61..ed584459e9b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -23,7 +23,7 @@ #include "video.h" extern "C" { -#include +#include "rswrapper.h" } using namespace std::literals; diff --git a/src/rswrapper.c b/src/rswrapper.c new file mode 100644 index 00000000000..b554bc29a2d --- /dev/null +++ b/src/rswrapper.c @@ -0,0 +1,157 @@ +/** + * @file src/rswrapper.c + * @brief Wrappers for nanors vectorization with different ISA options + */ + +// _FORTIFY_SOURCE can cause some versions of GCC to try to inline +// memset() with incompatible target options when compiling rs.c +#ifdef _FORTIFY_SOURCE + #undef _FORTIFY_SOURCE +#endif + +// The assert() function is decorated with __cold on macOS which +// is incompatible with Clang's target multiversioning feature +#ifndef NDEBUG + #define NDEBUG +#endif + +#define DECORATE_FUNC_I(a, b) a##b +#define DECORATE_FUNC(a, b) DECORATE_FUNC_I(a, b) + +// Append an ISA suffix to the public RS API +#define reed_solomon_init DECORATE_FUNC(reed_solomon_init, ISA_SUFFIX) +#define reed_solomon_new DECORATE_FUNC(reed_solomon_new, ISA_SUFFIX) +#define reed_solomon_new_static DECORATE_FUNC(reed_solomon_new_static, ISA_SUFFIX) +#define reed_solomon_release DECORATE_FUNC(reed_solomon_release, ISA_SUFFIX) +#define reed_solomon_decode DECORATE_FUNC(reed_solomon_decode, ISA_SUFFIX) +#define reed_solomon_encode DECORATE_FUNC(reed_solomon_encode, ISA_SUFFIX) + +// Append an ISA suffix to internal 
functions to prevent multiple definition errors +#define obl_axpy_ref DECORATE_FUNC(obl_axpy_ref, ISA_SUFFIX) +#define obl_scal_ref DECORATE_FUNC(obl_scal_ref, ISA_SUFFIX) +#define obl_axpyb32_ref DECORATE_FUNC(obl_axpyb32_ref, ISA_SUFFIX) +#define obl_axpy DECORATE_FUNC(obl_axpy, ISA_SUFFIX) +#define obl_scal DECORATE_FUNC(obl_scal, ISA_SUFFIX) +#define obl_swap DECORATE_FUNC(obl_swap, ISA_SUFFIX) +#define obl_axpyb32 DECORATE_FUNC(obl_axpyb32, ISA_SUFFIX) +#define axpy DECORATE_FUNC(axpy, ISA_SUFFIX) +#define scal DECORATE_FUNC(scal, ISA_SUFFIX) +#define gemm DECORATE_FUNC(gemm, ISA_SUFFIX) +#define invert_mat DECORATE_FUNC(invert_mat, ISA_SUFFIX) + +#if defined(__x86_64__) || defined(__i386__) + + // Compile a variant for SSSE3 + #if defined(__clang__) + #pragma clang attribute push(__attribute__((target("ssse3"))), apply_to = function) + #else + #pragma GCC push_options + #pragma GCC target("ssse3") + #endif + #define ISA_SUFFIX _ssse3 + #define OBLAS_SSE3 + #include "../third-party/nanors/rs.c" + #undef OBLAS_SSE3 + #undef ISA_SUFFIX + #if defined(__clang__) + #pragma clang attribute pop + #else + #pragma GCC pop_options + #endif + + // Compile a variant for AVX2 + #if defined(__clang__) + #pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function) + #else + #pragma GCC push_options + #pragma GCC target("avx2") + #endif + #define ISA_SUFFIX _avx2 + #define OBLAS_AVX2 + #include "../third-party/nanors/rs.c" + #undef OBLAS_AVX2 + #undef ISA_SUFFIX + #if defined(__clang__) + #pragma clang attribute pop + #else + #pragma GCC pop_options + #endif + + // Compile a variant for AVX512BW + #if defined(__clang__) + #pragma clang attribute push(__attribute__((target("avx512f,avx512bw"))), apply_to = function) + #else + #pragma GCC push_options + #pragma GCC target("avx512f,avx512bw") + #endif + #define ISA_SUFFIX _avx512 + #define OBLAS_AVX512 + #include "../third-party/nanors/rs.c" + #undef OBLAS_AVX512 + #undef ISA_SUFFIX + #if defined(__clang__) 
+ #pragma clang attribute pop + #else + #pragma GCC pop_options + #endif + +#endif + +// Compile a default variant +#define ISA_SUFFIX _def +#include "../third-party/nanors/deps/obl/autoshim.h" +#include "../third-party/nanors/rs.c" +#undef ISA_SUFFIX + +#undef reed_solomon_init +#undef reed_solomon_new +#undef reed_solomon_new_static +#undef reed_solomon_release +#undef reed_solomon_decode +#undef reed_solomon_encode + +#include "rswrapper.h" + +reed_solomon_new_t reed_solomon_new_fn; +reed_solomon_release_t reed_solomon_release_fn; +reed_solomon_encode_t reed_solomon_encode_fn; +reed_solomon_decode_t reed_solomon_decode_fn; + +/** + * @brief This initializes the RS function pointers to the best vectorized version available. + * @details The streaming code will directly invoke these function pointers during encoding. + */ +void +reed_solomon_init(void) { +#if defined(__x86_64__) || defined(__i386__) + if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) { + reed_solomon_new_fn = reed_solomon_new_avx512; + reed_solomon_release_fn = reed_solomon_release_avx512; + reed_solomon_encode_fn = reed_solomon_encode_avx512; + reed_solomon_decode_fn = reed_solomon_decode_avx512; + reed_solomon_init_avx512(); + } + else if (__builtin_cpu_supports("avx2")) { + reed_solomon_new_fn = reed_solomon_new_avx2; + reed_solomon_release_fn = reed_solomon_release_avx2; + reed_solomon_encode_fn = reed_solomon_encode_avx2; + reed_solomon_decode_fn = reed_solomon_decode_avx2; + reed_solomon_init_avx2(); + } + else if (__builtin_cpu_supports("ssse3")) { + reed_solomon_new_fn = reed_solomon_new_ssse3; + reed_solomon_release_fn = reed_solomon_release_ssse3; + reed_solomon_encode_fn = reed_solomon_encode_ssse3; + reed_solomon_decode_fn = reed_solomon_decode_ssse3; + reed_solomon_init_ssse3(); + } + else +#endif + { + reed_solomon_new_fn = reed_solomon_new_def; + reed_solomon_release_fn = reed_solomon_release_def; + reed_solomon_encode_fn = reed_solomon_encode_def; + 
reed_solomon_decode_fn = reed_solomon_decode_def; + reed_solomon_init_def(); + } +} diff --git a/src/rswrapper.h b/src/rswrapper.h new file mode 100644 index 00000000000..d9a4c01dca5 --- /dev/null +++ b/src/rswrapper.h @@ -0,0 +1,32 @@ +/** + * @file src/rswrapper.h + * @brief Wrappers for nanors vectorization + * @details This is a drop-in replacement for nanors rs.h + */ +#pragma once + +#include + +typedef struct _reed_solomon reed_solomon; + +typedef reed_solomon *(*reed_solomon_new_t)(int data_shards, int parity_shards); +typedef void (*reed_solomon_release_t)(reed_solomon *rs); +typedef int (*reed_solomon_encode_t)(reed_solomon *rs, uint8_t **shards, int nr_shards, int bs); +typedef int (*reed_solomon_decode_t)(reed_solomon *rs, uint8_t **shards, uint8_t *marks, int nr_shards, int bs); + +extern reed_solomon_new_t reed_solomon_new_fn; +extern reed_solomon_release_t reed_solomon_release_fn; +extern reed_solomon_encode_t reed_solomon_encode_fn; +extern reed_solomon_decode_t reed_solomon_decode_fn; + +#define reed_solomon_new reed_solomon_new_fn +#define reed_solomon_release reed_solomon_release_fn +#define reed_solomon_encode reed_solomon_encode_fn +#define reed_solomon_decode reed_solomon_decode_fn + +/** + * @brief This initializes the RS function pointers to the best vectorized version available. + * @details The streaming code will directly invoke these function pointers during encoding. 
+ */ +void +reed_solomon_init(void); diff --git a/src/stream.cpp b/src/stream.cpp index 46887c3a0aa..e7eb996865d 100644 --- a/src/stream.cpp +++ b/src/stream.cpp @@ -13,8 +13,10 @@ #include extern "C" { +// clang-format off #include -#include +#include "rswrapper.h" +// clang-format on } #include "config.h" @@ -236,7 +238,6 @@ namespace stream { } constexpr std::size_t MAX_AUDIO_PACKET_SIZE = 1400; - using rh_t = util::safe_ptr; using video_packet_t = util::c_ptr; using audio_packet_t = util::c_ptr; using audio_fec_packet_t = util::c_ptr; @@ -621,7 +622,7 @@ namespace stream { } namespace fec { - using rs_t = util::safe_ptr; + using rs_t = util::safe_ptr; struct fec_t { size_t data_shards; diff --git a/tests/unit/test_rswrapper.cpp b/tests/unit/test_rswrapper.cpp new file mode 100644 index 00000000000..a74a558bfad --- /dev/null +++ b/tests/unit/test_rswrapper.cpp @@ -0,0 +1,37 @@ +/** + * @file tests/unit/test_rswrapper.cpp + * @brief Test src/rswrapper.* + */ + +extern "C" { +#include +} + +#include + +TEST(ReedSolomonWrapperTests, InitTest) { + reed_solomon_init(); + + // Ensure all function pointers were populated + ASSERT_NE(reed_solomon_new, nullptr); + ASSERT_NE(reed_solomon_release, nullptr); + ASSERT_NE(reed_solomon_encode, nullptr); + ASSERT_NE(reed_solomon_decode, nullptr); +} + +TEST(ReedSolomonWrapperTests, EncodeTest) { + reed_solomon_init(); + + auto rs = reed_solomon_new(1, 1); + ASSERT_NE(rs, nullptr); + + uint8_t dataShard[16] = {}; + uint8_t fecShard[16] = {}; + + // If we picked the incorrect ISA in our wrapper, we should crash here + uint8_t *shardPtrs[2] = { dataShard, fecShard }; + auto ret = reed_solomon_encode(rs, shardPtrs, 2, sizeof(dataShard)); + ASSERT_EQ(ret, 0); + + reed_solomon_release(rs); +} diff --git a/third-party/nanors b/third-party/nanors index e9e242e98e2..19f07b513e9 160000 --- a/third-party/nanors +++ b/third-party/nanors @@ -1 +1 @@ -Subproject commit e9e242e98e27037830490b2a752895ca68f75f8b +Subproject commit 
19f07b513e924e471cadd141943c1ec4adc8d0e0