Skip to content

Commit

Permalink
Provide SSSE3, AVX2, and AVX512 optimized Reed-Solomon functions (Liz…
Browse files Browse the repository at this point in the history
…ardByte#2828)

* Provide SSSE3, AVX2, and AVX512 optimized Reed-Solomon functions

* Update nanors to fix AVX-512 memory corruption
  • Loading branch information
cgutman authored and KuleRucket committed Oct 9, 2024
1 parent b19d6ab commit 74eb24d
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 9 deletions.
4 changes: 2 additions & 2 deletions cmake/compile_definitions/common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ configure_file("${CMAKE_SOURCE_DIR}/src/version.h.in" version.h @ONLY)
include_directories("${CMAKE_CURRENT_BINARY_DIR}") # required for importing version.h

set(SUNSHINE_TARGET_FILES
"${CMAKE_SOURCE_DIR}/third-party/nanors/rs.c"
"${CMAKE_SOURCE_DIR}/third-party/nanors/rs.h"
"${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/Input.h"
"${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/Rtsp.h"
"${CMAKE_SOURCE_DIR}/third-party/moonlight-common-c/src/RtspParser.c"
Expand Down Expand Up @@ -108,6 +106,8 @@ set(SUNSHINE_TARGET_FILES
"${CMAKE_SOURCE_DIR}/src/round_robin.h"
"${CMAKE_SOURCE_DIR}/src/stat_trackers.h"
"${CMAKE_SOURCE_DIR}/src/stat_trackers.cpp"
"${CMAKE_SOURCE_DIR}/src/rswrapper.h"
"${CMAKE_SOURCE_DIR}/src/rswrapper.c"
${PLATFORM_TARGET_FILES})

if(NOT SUNSHINE_ASSETS_DIR_DEF)
Expand Down
4 changes: 2 additions & 2 deletions cmake/targets/common.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ set_source_files_properties("${CMAKE_SOURCE_DIR}/src/upnp.cpp"
PROPERTIES COMPILE_FLAGS -Wno-pedantic)

# third-party/nanors
set_source_files_properties("${CMAKE_SOURCE_DIR}/third-party/nanors/rs.c"
set_source_files_properties("${CMAKE_SOURCE_DIR}/src/rswrapper.c"
DIRECTORY "${CMAKE_SOURCE_DIR}" "${TEST_DIR}"
PROPERTIES COMPILE_FLAGS "-include deps/obl/autoshim.h -ftree-vectorize")
PROPERTIES COMPILE_FLAGS "-ftree-vectorize -funroll-loops")

# third-party/ViGEmClient
set(VIGEM_COMPILE_FLAGS "")
Expand Down
2 changes: 1 addition & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include "video.h"

extern "C" {
#include <rs.h>
#include "rswrapper.h"
}

using namespace std::literals;
Expand Down
157 changes: 157 additions & 0 deletions src/rswrapper.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
/**
* @file src/rswrapper.c
* @brief Wrappers for nanors vectorization with different ISA options
*/

// _FORTIY_SOURCE can cause some versions of GCC to try to inline
// memset() with incompatible target options when compiling rs.c
#ifdef _FORTIFY_SOURCE
#undef _FORTIFY_SOURCE
#endif

// The assert() function is decorated with __cold on macOS which
// is incompatible with Clang's target multiversioning feature
#ifndef NDEBUG
#define NDEBUG
#endif

#define DECORATE_FUNC_I(a, b) a##b
#define DECORATE_FUNC(a, b) DECORATE_FUNC_I(a, b)

// Append an ISA suffix to the public RS API
#define reed_solomon_init DECORATE_FUNC(reed_solomon_init, ISA_SUFFIX)
#define reed_solomon_new DECORATE_FUNC(reed_solomon_new, ISA_SUFFIX)
#define reed_solomon_new_static DECORATE_FUNC(reed_solomon_new_static, ISA_SUFFIX)
#define reed_solomon_release DECORATE_FUNC(reed_solomon_release, ISA_SUFFIX)
#define reed_solomon_decode DECORATE_FUNC(reed_solomon_decode, ISA_SUFFIX)
#define reed_solomon_encode DECORATE_FUNC(reed_solomon_encode, ISA_SUFFIX)

// Append an ISA suffix to internal functions to prevent multiple definition errors
#define obl_axpy_ref DECORATE_FUNC(obl_axpy_ref, ISA_SUFFIX)
#define obl_scal_ref DECORATE_FUNC(obl_scal_ref, ISA_SUFFIX)
#define obl_axpyb32_ref DECORATE_FUNC(obl_axpyb32_ref, ISA_SUFFIX)
#define obl_axpy DECORATE_FUNC(obl_axpy, ISA_SUFFIX)
#define obl_scal DECORATE_FUNC(obl_scal, ISA_SUFFIX)
#define obl_swap DECORATE_FUNC(obl_swap, ISA_SUFFIX)
#define obl_axpyb32 DECORATE_FUNC(obl_axpyb32, ISA_SUFFIX)
#define axpy DECORATE_FUNC(axpy, ISA_SUFFIX)
#define scal DECORATE_FUNC(scal, ISA_SUFFIX)
#define gemm DECORATE_FUNC(gemm, ISA_SUFFIX)
#define invert_mat DECORATE_FUNC(invert_mat, ISA_SUFFIX)

#if defined(__x86_64__) || defined(__i386__)

// Compile a variant for SSSE3
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("ssse3"))), apply_to = function)
#else
#pragma GCC push_options
#pragma GCC target("ssse3")
#endif
#define ISA_SUFFIX _ssse3
#define OBLAS_SSE3
#include "../third-party/nanors/rs.c"
#undef OBLAS_SSE3
#undef ISA_SUFFIX
#if defined(__clang__)
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif

// Compile a variant for AVX2
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx2"))), apply_to = function)
#else
#pragma GCC push_options
#pragma GCC target("avx2")
#endif
#define ISA_SUFFIX _avx2
#define OBLAS_AVX2
#include "../third-party/nanors/rs.c"
#undef OBLAS_AVX2
#undef ISA_SUFFIX
#if defined(__clang__)
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif

// Compile a variant for AVX512BW
#if defined(__clang__)
#pragma clang attribute push(__attribute__((target("avx512f,avx512bw"))), apply_to = function)
#else
#pragma GCC push_options
#pragma GCC target("avx512f,avx512bw")
#endif
#define ISA_SUFFIX _avx512
#define OBLAS_AVX512
#include "../third-party/nanors/rs.c"
#undef OBLAS_AVX512
#undef ISA_SUFFIX
#if defined(__clang__)
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif

#endif

// Compile a default variant
#define ISA_SUFFIX _def
#include "../third-party/nanors/deps/obl/autoshim.h"
#include "../third-party/nanors/rs.c"
#undef ISA_SUFFIX

#undef reed_solomon_init
#undef reed_solomon_new
#undef reed_solomon_new_static
#undef reed_solomon_release
#undef reed_solomon_decode
#undef reed_solomon_encode

#include "rswrapper.h"

reed_solomon_new_t reed_solomon_new_fn;
reed_solomon_release_t reed_solomon_release_fn;
reed_solomon_encode_t reed_solomon_encode_fn;
reed_solomon_decode_t reed_solomon_decode_fn;

/**
* @brief This initializes the RS function pointers to the best vectorized version available.
* @details The streaming code will directly invoke these function pointers during encoding.
*/
void
reed_solomon_init(void) {
#if defined(__x86_64__) || defined(__i386__)
if (__builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512bw")) {
reed_solomon_new_fn = reed_solomon_new_avx512;
reed_solomon_release_fn = reed_solomon_release_avx512;
reed_solomon_encode_fn = reed_solomon_encode_avx512;
reed_solomon_decode_fn = reed_solomon_decode_avx512;
reed_solomon_init_avx512();
}
else if (__builtin_cpu_supports("avx2")) {
reed_solomon_new_fn = reed_solomon_new_avx2;
reed_solomon_release_fn = reed_solomon_release_avx2;
reed_solomon_encode_fn = reed_solomon_encode_avx2;
reed_solomon_decode_fn = reed_solomon_decode_avx2;
reed_solomon_init_avx2();
}
else if (__builtin_cpu_supports("ssse3")) {
reed_solomon_new_fn = reed_solomon_new_ssse3;
reed_solomon_release_fn = reed_solomon_release_ssse3;
reed_solomon_encode_fn = reed_solomon_encode_ssse3;
reed_solomon_decode_fn = reed_solomon_decode_ssse3;
reed_solomon_init_ssse3();
}
else
#endif
{
reed_solomon_new_fn = reed_solomon_new_def;
reed_solomon_release_fn = reed_solomon_release_def;
reed_solomon_encode_fn = reed_solomon_encode_def;
reed_solomon_decode_fn = reed_solomon_decode_def;
reed_solomon_init_def();
}
}
32 changes: 32 additions & 0 deletions src/rswrapper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/**
* @file src/rswrapper.h
* @brief Wrappers for nanors vectorization
* @details This is a drop-in replacement for nanors rs.h
*/
#pragma once

#include <stdint.h>

typedef struct _reed_solomon reed_solomon;

typedef reed_solomon *(*reed_solomon_new_t)(int data_shards, int parity_shards);
typedef void (*reed_solomon_release_t)(reed_solomon *rs);
typedef int (*reed_solomon_encode_t)(reed_solomon *rs, uint8_t **shards, int nr_shards, int bs);
typedef int (*reed_solomon_decode_t)(reed_solomon *rs, uint8_t **shards, uint8_t *marks, int nr_shards, int bs);

extern reed_solomon_new_t reed_solomon_new_fn;
extern reed_solomon_release_t reed_solomon_release_fn;
extern reed_solomon_encode_t reed_solomon_encode_fn;
extern reed_solomon_decode_t reed_solomon_decode_fn;

#define reed_solomon_new reed_solomon_new_fn
#define reed_solomon_release reed_solomon_release_fn
#define reed_solomon_encode reed_solomon_encode_fn
#define reed_solomon_decode reed_solomon_decode_fn

/**
* @brief This initializes the RS function pointers to the best vectorized version available.
* @details The streaming code will directly invoke these function pointers during encoding.
*/
void
reed_solomon_init(void);
7 changes: 4 additions & 3 deletions src/stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
#include <boost/endian/arithmetic.hpp>

extern "C" {
// clang-format off
#include <moonlight-common-c/src/Limelight-internal.h>
#include <rs.h>
#include "rswrapper.h"
// clang-format on
}

#include "config.h"
Expand Down Expand Up @@ -236,7 +238,6 @@ namespace stream {
}
constexpr std::size_t MAX_AUDIO_PACKET_SIZE = 1400;

using rh_t = util::safe_ptr<reed_solomon, reed_solomon_release>;
using video_packet_t = util::c_ptr<video_packet_raw_t>;
using audio_packet_t = util::c_ptr<audio_packet_raw_t>;
using audio_fec_packet_t = util::c_ptr<audio_fec_packet_raw_t>;
Expand Down Expand Up @@ -621,7 +622,7 @@ namespace stream {
}

namespace fec {
using rs_t = util::safe_ptr<reed_solomon, reed_solomon_release>;
using rs_t = util::safe_ptr<reed_solomon, [](reed_solomon *rs) { reed_solomon_release(rs); }>;

struct fec_t {
size_t data_shards;
Expand Down
37 changes: 37 additions & 0 deletions tests/unit/test_rswrapper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/**
* @file tests/unit/test_rswrapper.cpp
* @brief Test src/rswrapper.*
*/

extern "C" {
#include <src/rswrapper.h>
}

#include <tests/conftest.cpp>

TEST(ReedSolomonWrapperTests, InitTest) {
reed_solomon_init();

// Ensure all function pointers were populated
ASSERT_NE(reed_solomon_new, nullptr);
ASSERT_NE(reed_solomon_release, nullptr);
ASSERT_NE(reed_solomon_encode, nullptr);
ASSERT_NE(reed_solomon_decode, nullptr);
}

TEST(ReedSolomonWrapperTests, EncodeTest) {
reed_solomon_init();

auto rs = reed_solomon_new(1, 1);
ASSERT_NE(rs, nullptr);

uint8_t dataShard[16] = {};
uint8_t fecShard[16] = {};

// If we picked the incorrect ISA in our wrapper, we should crash here
uint8_t *shardPtrs[2] = { dataShard, fecShard };
auto ret = reed_solomon_encode(rs, shardPtrs, 2, sizeof(dataShard));
ASSERT_EQ(ret, 0);

reed_solomon_release(rs);
}
2 changes: 1 addition & 1 deletion third-party/nanors
Submodule nanors updated 1 files
+1 −1 deps/obl/oblas_lite.c

0 comments on commit 74eb24d

Please sign in to comment.