From 68666db994d57fb118b7cd9b24ce29db2d257fb4 Mon Sep 17 00:00:00 2001 From: Larry Gritz Date: Sat, 16 Mar 2024 09:00:42 -0700 Subject: [PATCH] refactor(simd.h): use default for ctrs and assignment where applicable (#4187) simd.h improvements and refactorization for consistency: * Use `= default` for all the uninitialized constructors, copy constructors, and assignment from the same type. * Fix the `operator=` to be more standard by returning a `vec&` instead of `const vec&`, which apparently is unusual. * Test that these all result in C++ understanding that these classes are trivially copyable. There is a bit of awkwardness where for the vint types, we have to define the copy constructors for the Intel icc compiler. It just seems to do the wrong thing for `=default` and fails lots of our tests. I don't understand why, and I'm not inclined to investigate too deeply, since it's a discontinued compiler we support only for legacy reasons, so it doesn't bother me too much that we special-case it. One reason I think it's a compiler bug is that it has no problem with the floating point vector types. It's confused about something with the simd ints. Disable SIMD acceleration of fast_exp2 when using MSVS prior to 2022. It seems to sporadically generate bad code in this function and I can't seem to restructure the function in a way that consistently is correct. Doesn't seem to happen with newer MSVS, nor on any other platform or compiler. I'm assuming this is a compiler bug and it's not worth pursuing for an old version of one compiler, for this one function that isn't really used all that often or in performance-critical paths. Some extra slosh in the simd fast_exp test. Rewrite OIIO_CHECK_SIMD_EQUAL_THRESH to print the differences and use more digits of precision when it fails. --------- Signed-off-by: Larry Gritz --- .github/workflows/ci.yml | 6 +- src/cmake/compiler.cmake | 2 +- src/include/OpenImageIO/fmath.h | 2 +- src/include/OpenImageIO/simd.h | 203 +++++++++++++---------------- src/include/OpenImageIO/unittest.h | 21 +-- src/libutil/simd_test.cpp | 24 +++- 6 files changed, 132 insertions(+), 126 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 034fa373b4..3c8585c695 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -547,15 +547,15 @@ jobs: generator: "Visual Studio 16 2019" openexr_ver: v3.2.2 python_ver: 3.7 - simd: sse4.2 + # simd: sse4.2 # - desc: windows-2022 # runner: windows-2022 # vsver: 2022 # generator: "Visual Studio 17 2022" - # openexr_ver: main + # openexr_ver: v3.2.2 # # v3.1.4 # python_ver: 3.7 - # simd: sse4.2 + # # simd: AVX2 runs-on: ${{ matrix.runner }} env: PYTHON_VERSION: ${{matrix.python_ver}} diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake index e8f3061678..bb2ee0f445 100644 --- a/src/cmake/compiler.cmake +++ b/src/cmake/compiler.cmake @@ -300,8 +300,8 @@ endif () # set (USE_SIMD "" CACHE STRING "Use SIMD directives (0, sse2, sse3, ssse3, sse4.1, sse4.2, avx, avx2, avx512f, f16c, aes)") set (SIMD_COMPILE_FLAGS "") +message (STATUS "Compiling with SIMD level ${USE_SIMD}") if (NOT USE_SIMD STREQUAL "") - message (STATUS "Compiling with SIMD level ${USE_SIMD}") if (USE_SIMD STREQUAL "0") set (SIMD_COMPILE_FLAGS ${SIMD_COMPILE_FLAGS} "-DOIIO_NO_SIMD=1") else () diff --git a/src/include/OpenImageIO/fmath.h b/src/include/OpenImageIO/fmath.h index eb276480b9..e0467a4b55 100644 --- a/src/include/OpenImageIO/fmath.h +++ b/src/include/OpenImageIO/fmath.h @@ -1852,7 +1852,7 @@ template OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2 (const T& xval) { using namespace simd; typedef typename T::vint_t intN; -#if OIIO_SIMD_SSE +#if OIIO_SIMD_SSE && !OIIO_MSVS_BEFORE_2022 // See float specialization for explanations T x = clamp (xval, T(-126.0f), T(126.0f)); intN m (x); x -= T(m); diff --git a/src/include/OpenImageIO/simd.h b/src/include/OpenImageIO/simd.h index 7aaf578ba5..c80247b929 100644 --- a/src/include/OpenImageIO/simd.h +++ b/src/include/OpenImageIO/simd.h @@ -531,7 +531,7 @@ class vbool4 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vbool4 () { } + vbool4() = default; /// Construct from a single value (store it in all slots) vbool4 (bool a) { load(a); } @@ -542,7 +542,7 @@ class vbool4 { vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); } /// Copy construct from another vbool4 - vbool4 (const vbool4 &other) { m_simd = other.m_simd; } + vbool4(const vbool4 &other) = default; /// Construct from 4 int values vbool4 (int a, int b, int c, int d) { @@ -576,10 +576,10 @@ class vbool4 { static const vbool4 True (); /// Assign one value to all components - const vbool4 & operator= (bool a) { load(a); return *this; } + vbool4& operator=(bool a) { load(a); return *this; } /// Assignment of another vbool4 - const vbool4 & operator= (const vbool4 & other); + vbool4& operator=(const vbool4& other) = default; /// Component access (get) int operator[] (int i) const; @@ -673,7 +673,7 @@ class vbool8 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vbool8 () { } + vbool8() = default; /// Construct from a single value (store it in all slots) vbool8 (bool a) { load (a); } @@ -684,7 +684,7 @@ class vbool8 { vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h); /// Copy construct from another vbool8 - vbool8 (const vbool8 &other) { m_simd = other.m_simd; } + vbool8(const vbool8 &other) = default; /// Construct from 8 int values vbool8 (int a, int b, int c, int d, int e, int f, int g, int h); @@ -719,10 +719,10 @@ class vbool8 { static const vbool8 True (); /// Assign one value to all components - const vbool8 & operator= (bool a); + vbool8& operator=(bool a) { load(a); return *this; } /// Assignment of another vbool8 - const vbool8 & operator= (const vbool8 & other); + vbool8& operator=(const vbool8& other) = default; /// Component access (get) int operator[] (int i) const; @@ -822,7 +822,7 @@ class vbool16 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vbool16 () { } + vbool16() = default; /// Construct from a single value (store it in all slots) vbool16 (bool a) { load (a); } @@ -836,7 +836,7 @@ class vbool16 { bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15); /// Copy construct from another vbool16 - vbool16 (const vbool16 &other) { m_simd = other.m_simd; } + vbool16(const vbool16 &other) = default; /// Construct from 16 int values vbool16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7, @@ -874,10 +874,10 @@ class vbool16 { static const vbool16 True (); /// Assign one value to all components - const vbool16 & operator= (bool a); + vbool16& operator=(bool a) { load(a); return *this; } /// Assignment of another vbool16 - const vbool16 & operator= (const vbool16 & other); + vbool16& operator=(const vbool16& other) = default; /// Component access (get) int operator[] (int i) const; @@ -971,7 +971,7 @@ class vint4 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vint4 () { } + vint4() = default; /// Construct from a single value (store it in all slots) vint4 (int a); @@ -998,7 +998,7 @@ class vint4 { explicit vint4 (const char *vals); /// Copy construct from another vint4 - vint4 (const vint4 & other) { m_simd = other.m_simd; } + vint4(const vint4& other) = default; /// Convert a vfloat to an vint. Equivalent to i = (int)f; explicit vint4 (const vfloat4& f); // implementation below @@ -1035,10 +1035,15 @@ class vint4 { static const vint4 Giota (); /// Assign one value to all components. - const vint4 & operator= (int a); + vint4& operator=(int a) { load(a); return *this; } /// Assignment from another vint4 - const vint4 & operator= (const vint4& other) ; +#if !defined(__INTEL_COMPILER) + vint4& operator=(const vint4& other) = default; +#else + // For explanation of the necessity of this, see implementation comment. + vint4& operator=(const vint4& other); +#endif /// Component access (get) int operator[] (int i) const; @@ -1262,7 +1267,7 @@ class vint8 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vint8 () { } + vint8() = default; /// Construct from a single value (store it in all slots) vint8 (int a); @@ -1289,7 +1294,7 @@ class vint8 { explicit vint8 (const char *vals); /// Copy construct from another vint8 - vint8 (const vint8 & other) { m_simd = other.m_simd; } + vint8(const vint8& other) = default; /// Convert a vfloat8 to an vint8. Equivalent to i = (int)f; explicit vint8 (const vfloat8& f); // implementation below @@ -1329,10 +1334,15 @@ class vint8 { static const vint8 Giota (); /// Assign one value to all components. - const vint8 & operator= (int a); + vint8& operator=(int a) { load(a); return *this; } /// Assignment from another vint8 - const vint8 & operator= (const vint8& other) ; +#if !defined(__INTEL_COMPILER) + vint8& operator=(const vint8& other) = default; +#else + // For explanation of the necessity of this, see implementation comment. + vint8& operator=(const vint8& other); +#endif /// Component access (get) int operator[] (int i) const; @@ -1561,7 +1571,7 @@ class vint16 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vint16 () { } + vint16() = default; /// Construct from a single value (store it in all slots) vint16 (int a); @@ -1586,7 +1596,7 @@ class vint16 { explicit vint16 (const char *vals); /// Copy construct from another vint16 - vint16 (const vint16 & other) { m_simd = other.m_simd; } + vint16(const vint16& other) = default; /// Convert a vfloat16 to an vint16. Equivalent to i = (int)f; explicit vint16 (const vfloat16& f); // implementation below @@ -1629,10 +1639,15 @@ class vint16 { static const vint16 Giota (); /// Assign one value to all components. - const vint16 & operator= (int a); + vint16& operator=(int a) { load(a); return *this; } /// Assignment from another vint16 - const vint16 & operator= (const vint16& other) ; +#if !defined(__INTEL_COMPILER) + vint16& operator=(const vint16& other) = default; +#else + // For explanation of the necessity of this, see implementation comment. + vint16& operator=(const vint16& other); +#endif /// Component access (get) int operator[] (int i) const; @@ -1873,7 +1888,7 @@ class vfloat4 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vfloat4 () { } + vfloat4() = default; /// Construct from a single value (store it in all slots) vfloat4 (float a) { load(a); } @@ -1885,7 +1900,7 @@ class vfloat4 { vfloat4 (const float *f) { load (f); } /// Copy construct from another vfloat4 - vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; } + vfloat4(const vfloat4 &other) = default; /// Construct from an vint4 (promoting all components to float) explicit vfloat4 (const vint4& ival); @@ -1940,13 +1955,10 @@ class vfloat4 { #endif /// Assign a single value to all components - const vfloat4 & operator= (float a) { load(a); return *this; } + vfloat4& operator=(float a) { load(a); return *this; } /// Assign a vfloat4 - const vfloat4 & operator= (vfloat4 other) { - m_simd = other.m_simd; - return *this; - } + vfloat4& operator=(const vfloat4& other) = default; /// Return a vfloat4 with all components set to 0.0 static const vfloat4 Zero (); @@ -2254,7 +2266,7 @@ class vfloat3 : public vfloat4 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vfloat3 () { } + vfloat3() = default; /// Construct from a single value (store it in all slots) vfloat3 (float a) { load(a); } @@ -2278,7 +2290,7 @@ class vfloat3 : public vfloat4 { vfloat3(const V& v) : vfloat3(v[0], v[1], v[2]) { } /// Copy construct from another vfloat3 - vfloat3 (const vfloat3 &other); + vfloat3(const vfloat3& other) = default; /// Construct from a vfloat4. Note: it will not zero out the internal /// 4th component, but rather accept on faith that the vfloat4 you are @@ -2315,7 +2327,7 @@ class vfloat3 : public vfloat4 { #endif /// Assign a single value to all components - const vfloat3 & operator= (float a) { load(a); return *this; } + vfloat3& operator=(float a) { load(a); return *this; } /// Return a vfloat3 with all components set to 0.0 static const vfloat3 Zero (); @@ -2421,15 +2433,10 @@ class matrix44 { static constexpr int elements = 16; // Uninitialized - OIIO_FORCEINLINE matrix44() { } + OIIO_FORCEINLINE matrix44() = default; /// Copy constructor - OIIO_FORCEINLINE matrix44 (const matrix44 &M) { - m_row[0] = M[0]; - m_row[1] = M[1]; - m_row[2] = M[2]; - m_row[3] = M[3]; - } + OIIO_FORCEINLINE matrix44(const matrix44 &M) = default; /// Construct from a float array OIIO_FORCEINLINE explicit matrix44 (const float *f) { @@ -2480,7 +2487,7 @@ class matrix44 { const vfloat4& operator[] (int i) const; /// Assignment - const matrix44& operator= (const matrix44& m); + matrix44& operator=(const matrix44& m) = default; /// Return the transposed matrix matrix44 transposed () const; @@ -2556,7 +2563,7 @@ class vfloat8 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vfloat8 () { } + vfloat8() = default; /// Construct from a single value (store it in all slots) vfloat8 (float a) { load(a); } @@ -2569,7 +2576,7 @@ class vfloat8 { vfloat8 (const float *f) { load (f); } /// Copy construct from another vfloat8 - vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; } + vfloat8(const vfloat8 &other) = default; /// Construct from an int vector (promoting all components to float) explicit vfloat8 (const vint8& ival); @@ -2607,13 +2614,10 @@ class vfloat8 { #endif /// Assign a single value to all components - const vfloat8& operator= (float a) { load(a); return *this; } + vfloat8& operator=(float a) { load(a); return *this; } /// Assign a vfloat8 - const vfloat8& operator= (vfloat8 other) { - m_simd = other.m_simd; - return *this; - } + vfloat8& operator=(const vfloat8& other) = default; /// Return a vfloat8 with all components set to 0.0 static const vfloat8 Zero (); @@ -2877,7 +2881,7 @@ class vfloat16 { static constexpr size_t size() noexcept { return elements; } /// Default constructor (contents undefined) - vfloat16 () { } + vfloat16() = default; /// Construct from a single value (store it in all slots) vfloat16 (float a) { load(a); } @@ -2892,7 +2896,7 @@ class vfloat16 { vfloat16 (const float *f) { load (f); } /// Copy construct from another vfloat16 - vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; } + vfloat16(const vfloat16 &other) = default; /// Construct from an int vector (promoting all components to float) explicit vfloat16 (const vint16& ival); @@ -2933,13 +2937,10 @@ class vfloat16 { #endif /// Assign a single value to all components - const vfloat16& operator= (float a) { load(a); return *this; } + vfloat16& operator=(float a) { load(a); return *this; } /// Assign a vfloat16 - const vfloat16& operator= (vfloat16 other) { - m_simd = other.m_simd; - return *this; - } + vfloat16& operator=(const vfloat16& other) = default; /// Return a vfloat16 with all components set to 0.0 static const vfloat16 Zero (); @@ -3314,11 +3315,6 @@ OIIO_FORCEINLINE vbool4::vbool4 (const bool *a) { load (a[0], a[1], a[2], a[3]); } -OIIO_FORCEINLINE const vbool4& vbool4::operator= (const vbool4 & other) { - m_simd = other.m_simd; - return *this; -} - OIIO_FORCEINLINE int vbool4::bitmask () const { #if OIIO_SIMD_SSE @@ -3683,16 +3679,6 @@ OIIO_FORCEINLINE vbool8::vbool8 (const bool *a) { } -OIIO_FORCEINLINE const vbool8& vbool8::operator= (bool a) { - load(a); - return *this; -} - -OIIO_FORCEINLINE const vbool8& vbool8::operator= (const vbool8 & other) { - m_simd = other.m_simd; - return *this; -} - OIIO_FORCEINLINE int vbool8::bitmask () const { #if OIIO_SIMD_AVX return _mm256_movemask_ps(m_simd); @@ -4003,17 +3989,6 @@ OIIO_FORCEINLINE vbool16::vbool16 (const bool *a) { } -OIIO_FORCEINLINE const vbool16& vbool16::operator= (bool a) { - load(a); - return *this; -} - -OIIO_FORCEINLINE const vbool16& vbool16::operator= (const vbool16 & other) { - m_simd = other.m_simd; - return *this; -} - - OIIO_FORCEINLINE int vbool16::bitmask () const { return static_cast(m_simd); } @@ -4164,10 +4139,17 @@ OIIO_FORCEINLINE bool none (const vbool16& v) { return reduce_or(v) == false; } ////////////////////////////////////////////////////////////////////// // vint4 implementation -OIIO_FORCEINLINE const vint4 & vint4::operator= (const vint4& other) { +#if defined(__INTEL_COMPILER) +// For reasons we don't understand, all sorts of failures crop up only on icc +// if we make this =default. Although we still support icc for now, it's a +// discontinued compiler, so we special-case it here rather than spend a lot +// of time investigating what might be broken (and would of course never be +// fixed if it's a compiler bug). +OIIO_FORCEINLINE vint4& vint4::operator=(const vint4& other) { m_simd = other.m_simd; return *this; } +#endif OIIO_FORCEINLINE int vint4::operator[] (int i) const { OIIO_DASSERT(i= 512 @@ -8069,14 +8059,6 @@ OIIO_FORCEINLINE vint4 AxBxCxDx (const vint4& a, const vint4& b, ////////////////////////////////////////////////////////////////////// // vfloat3 implementation -OIIO_FORCEINLINE vfloat3::vfloat3 (const vfloat3 &other) : vfloat4(other) { -#if OIIO_SIMD_SSE || OIIO_SIMD_NEON - m_simd = other.m_simd; -#else - SIMD_CONSTRUCT_PAD (other[i]); -#endif -} - OIIO_FORCEINLINE vfloat3::vfloat3 (const vfloat4 &other) { #if OIIO_SIMD_SSE || OIIO_SIMD_NEON m_simd = other.simd(); @@ -8340,16 +8322,6 @@ OIIO_FORCEINLINE const vfloat4& matrix44::operator[] (int i) const { } -OIIO_FORCEINLINE const matrix44& matrix44::operator= (const matrix44& m) -{ - m_row[0] = m[0]; - m_row[1] = m[1]; - m_row[2] = m[2]; - m_row[3] = m[3]; - return *this; -} - - OIIO_FORCEINLINE matrix44 matrix44::transposed () const { #if OIIO_SIMD_SSE matrix44 T; @@ -10360,6 +10332,19 @@ template<> struct formatter : OIIO::pvt::array_formatter {}; } // namespace fmt + +// Allow C++ metaprogramming to understand that the simd types are trivially +// copyable (i.e. memcpy to copy simd types is fine). +namespace std { // not necessary in C++17, just say std::is_trivially_copyable +#if defined(__INTEL_COMPILER) +// Necessary because we have to define the vint types copy constructors on icc +template<> struct is_trivially_copyable : std::true_type {}; +template<> struct is_trivially_copyable : std::true_type {}; +template<> struct is_trivially_copyable : std::true_type {}; +#endif +} // namespace std + + #undef SIMD_DO #undef SIMD_CONSTRUCT #undef SIMD_CONSTRUCT_PAD diff --git a/src/include/OpenImageIO/unittest.h b/src/include/OpenImageIO/unittest.h index 6ffcc45d34..a605cc5dfd 100644 --- a/src/include/OpenImageIO/unittest.h +++ b/src/include/OpenImageIO/unittest.h @@ -231,16 +231,17 @@ static OIIO::pvt::UnitTestFailureCounter unit_test_failures; << "'\n"), \ (void)++unit_test_failures)) -#define OIIO_CHECK_SIMD_EQUAL_THRESH(x, y, eps) \ - (all(abs((x) - (y)) <= (eps)) \ - ? ((void)0) \ - : ((std::cout << OIIO::Sysutil::Term(std::cout).ansi("red,bold") \ - << __FILE__ << ":" << __LINE__ << ":\n" \ - << "FAILED: " \ - << OIIO::Sysutil::Term(std::cout).ansi("normal") << #x \ - << " == " << #y << "\n" \ - << "\tvalues were '" << (x) << "' and '" << (y) \ - << "'\n"), \ +#define OIIO_CHECK_SIMD_EQUAL_THRESH(x, y, eps) \ + ((all(abs((x) - (y)) <= eps)) \ + ? ((void)0) \ + : (Strutil::print("{}{}:{}:\nFAILED: {}{} == {}\n", \ + OIIO::Sysutil::Term(std::cout).ansi("red,bold"), \ + __FILE__, __LINE__, \ + OIIO::Sysutil::Term(std::cout).ansi("normal"), #x, \ + #y), \ + Strutil::print( \ + "\tvalues were '{:.8g}' and '{:.8g}', diff was {:.8g}\n", (x), \ + (y), abs((x) - (y))), \ (void)++unit_test_failures)) diff --git a/src/libutil/simd_test.cpp b/src/libutil/simd_test.cpp index 99c377a303..ca67ef75d6 100644 --- a/src/libutil/simd_test.cpp +++ b/src/libutil/simd_test.cpp @@ -1611,8 +1611,8 @@ void test_mathfuncs () VEC expA = mkvec (0.367879441171442f, 1.0f, 2.718281828459045f, 90.0171313005218f); OIIO_CHECK_SIMD_EQUAL (exp(A), expA); OIIO_CHECK_SIMD_EQUAL_THRESH (log(expA), A, 1e-6f); - OIIO_CHECK_SIMD_EQUAL (fast_exp(A), - mkvec(fast_exp(A[0]), fast_exp(A[1]), fast_exp(A[2]), fast_exp(A[3]))); + OIIO_CHECK_SIMD_EQUAL_THRESH (fast_exp(A), + mkvec(fast_exp(A[0]), fast_exp(A[1]), fast_exp(A[2]), fast_exp(A[3])), 1e-5f); OIIO_CHECK_SIMD_EQUAL_THRESH (fast_log(expA), mkvec(fast_log(expA[0]), fast_log(expA[1]), fast_log(expA[2]), fast_log(expA[3])), 0.00001f); OIIO_CHECK_SIMD_EQUAL_THRESH (fast_pow_pos(VEC(2.0f), A), @@ -1939,6 +1939,25 @@ test_matrix() +static void +test_trivially_copyable() +{ + print("\nTesting trivially_copyable on all SIMD classes\n"); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); + OIIO_CHECK_ASSERT(std::is_trivially_copyable::value); +} + + + int main(int argc, char* argv[]) { @@ -2094,6 +2113,7 @@ main(int argc, char* argv[]) test_special(); test_metaprogramming(); test_matrix(); + test_trivially_copyable(); std::cout << "\nTotal time: " << Strutil::timeintervalformat(timer()) << "\n";