From 68666db994d57fb118b7cd9b24ce29db2d257fb4 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 16 Mar 2024 09:00:42 -0700
Subject: [PATCH] refactor(simd.h): use default for ctrs and assignment where
 applicable (#4187)

simd.h improvements and refactorization for consistency:

* Use `= default` for all the uninitialized constructors, copy
constructors, and assignment from the same type.
* Fix the `operator=` to be more standard by returning a `vec&` instead
of `const vec&`, which apparently is unusual.
* Test that these all result in C++ understanding that these classes are
trivially copyable.

There is a bit of awkwardness where for the vint types, we have to
define the copy constructors for the Intel icc compiler. It just seems
to do the wrong thing for `=default` and fails lots of our tests. I
don't understand why, and I'm not inclined to investigate too deeply,
since it's a discontinued compiler we support only for legacy reasons,
so it doesn't bother me too much that we special-case it. One reason I
think it's a compiler bug is that it has no problem with the floating
point vector types. It's confused about something with the simd ints.

Disable SIMD acceleration of fast_exp2 when using MSVS prior to 2022.
It seems to sporadically generate bad code in this function and I
can't seem to restructure the function in a way that consistently is
correct. Doesn't seem to happen with newer MSVS, nor on any other
platform or compiler. I'm assuming this is a compiler bug and it's not
worth pursuing for an old version of one compiler, for this one
function that isn't really used all that often or in
performance-critical paths.

Some extra slosh in the simd fast_exp test.

Rewrite OIIO_CHECK_SIMD_EQUAL_THRESH to print the differences and use
more digits of precision when it fails.

---------

Signed-off-by: Larry Gritz <lg@larrygritz.com>
---
 .github/workflows/ci.yml           |   6 +-
 src/cmake/compiler.cmake           |   2 +-
 src/include/OpenImageIO/fmath.h    |   2 +-
 src/include/OpenImageIO/simd.h     | 203 +++++++++++++----------------
 src/include/OpenImageIO/unittest.h |  21 +--
 src/libutil/simd_test.cpp          |  24 +++-
 6 files changed, 132 insertions(+), 126 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 034fa373b4..3c8585c695 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -547,15 +547,15 @@ jobs:
             generator: "Visual Studio 16 2019"
             openexr_ver: v3.2.2
             python_ver: 3.7
-            simd: sse4.2
+            # simd: sse4.2
           # - desc: windows-2022
           #   runner: windows-2022
           #   vsver: 2022
           #   generator: "Visual Studio 17 2022"
-          #   openexr_ver: main
+          #   openexr_ver: v3.2.2
           #   # v3.1.4
           #   python_ver: 3.7
-          #   simd: sse4.2
+          #   # simd: AVX2
     runs-on: ${{ matrix.runner }}
     env:
       PYTHON_VERSION: ${{matrix.python_ver}}
diff --git a/src/cmake/compiler.cmake b/src/cmake/compiler.cmake
index e8f3061678..bb2ee0f445 100644
--- a/src/cmake/compiler.cmake
+++ b/src/cmake/compiler.cmake
@@ -300,8 +300,8 @@ endif ()
 #
 set (USE_SIMD "" CACHE STRING "Use SIMD directives (0, sse2, sse3, ssse3, sse4.1, sse4.2, avx, avx2, avx512f, f16c, aes)")
 set (SIMD_COMPILE_FLAGS "")
+message (STATUS "Compiling with SIMD level ${USE_SIMD}")
 if (NOT USE_SIMD STREQUAL "")
-    message (STATUS "Compiling with SIMD level ${USE_SIMD}")
     if (USE_SIMD STREQUAL "0")
         set (SIMD_COMPILE_FLAGS ${SIMD_COMPILE_FLAGS} "-DOIIO_NO_SIMD=1")
     else ()
diff --git a/src/include/OpenImageIO/fmath.h b/src/include/OpenImageIO/fmath.h
index eb276480b9..e0467a4b55 100644
--- a/src/include/OpenImageIO/fmath.h
+++ b/src/include/OpenImageIO/fmath.h
@@ -1852,7 +1852,7 @@ template<typename T>
 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2 (const T& xval) {
     using namespace simd;
     typedef typename T::vint_t intN;
-#if OIIO_SIMD_SSE
+#if OIIO_SIMD_SSE && !OIIO_MSVS_BEFORE_2022
     // See float specialization for explanations
     T x = clamp (xval, T(-126.0f), T(126.0f));
     intN m (x); x -= T(m);
diff --git a/src/include/OpenImageIO/simd.h b/src/include/OpenImageIO/simd.h
index 7aaf578ba5..c80247b929 100644
--- a/src/include/OpenImageIO/simd.h
+++ b/src/include/OpenImageIO/simd.h
@@ -531,7 +531,7 @@ class vbool4 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vbool4 () { }
+    vbool4() = default;
 
     /// Construct from a single value (store it in all slots)
     vbool4 (bool a) { load(a); }
@@ -542,7 +542,7 @@ class vbool4 {
     vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); }
 
     /// Copy construct from another vbool4
-    vbool4 (const vbool4 &other) { m_simd = other.m_simd; }
+    vbool4(const vbool4 &other) = default;
 
     /// Construct from 4 int values
     vbool4 (int a, int b, int c, int d) {
@@ -576,10 +576,10 @@ class vbool4 {
     static const vbool4 True ();
 
     /// Assign one value to all components
-    const vbool4 & operator= (bool a) { load(a); return *this; }
+    vbool4& operator=(bool a) { load(a); return *this; }
 
     /// Assignment of another vbool4
-    const vbool4 & operator= (const vbool4 & other);
+    vbool4& operator=(const vbool4& other) = default;
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -673,7 +673,7 @@ class vbool8 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vbool8 () { }
+    vbool8() = default;
 
     /// Construct from a single value (store it in all slots)
     vbool8 (bool a) { load (a); }
@@ -684,7 +684,7 @@ class vbool8 {
     vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h);
 
     /// Copy construct from another vbool8
-    vbool8 (const vbool8 &other) { m_simd = other.m_simd; }
+    vbool8(const vbool8 &other) = default;
 
     /// Construct from 8 int values
     vbool8 (int a, int b, int c, int d, int e, int f, int g, int h);
@@ -719,10 +719,10 @@ class vbool8 {
     static const vbool8 True ();
 
     /// Assign one value to all components
-    const vbool8 & operator= (bool a);
+    vbool8& operator=(bool a) { load(a); return *this; }
 
     /// Assignment of another vbool8
-    const vbool8 & operator= (const vbool8 & other);
+    vbool8& operator=(const vbool8& other) = default;
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -822,7 +822,7 @@ class vbool16 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vbool16 () { }
+    vbool16() = default;
 
     /// Construct from a single value (store it in all slots)
     vbool16 (bool a) { load (a); }
@@ -836,7 +836,7 @@ class vbool16 {
             bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
 
     /// Copy construct from another vbool16
-    vbool16 (const vbool16 &other) { m_simd = other.m_simd; }
+    vbool16(const vbool16 &other) = default;
 
     /// Construct from 16 int values
     vbool16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
@@ -874,10 +874,10 @@ class vbool16 {
     static const vbool16 True ();
 
     /// Assign one value to all components
-    const vbool16 & operator= (bool a);
+    vbool16& operator=(bool a) { load(a); return *this; }
 
     /// Assignment of another vbool16
-    const vbool16 & operator= (const vbool16 & other);
+    vbool16& operator=(const vbool16& other) = default;
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -971,7 +971,7 @@ class vint4 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vint4 () { }
+    vint4() = default;
 
     /// Construct from a single value (store it in all slots)
     vint4 (int a);
@@ -998,7 +998,7 @@ class vint4 {
     explicit vint4 (const char *vals);
 
     /// Copy construct from another vint4
-    vint4 (const vint4 & other) { m_simd = other.m_simd; }
+    vint4(const vint4& other) = default;
 
     /// Convert a vfloat to an vint. Equivalent to i = (int)f;
     explicit vint4 (const vfloat4& f); // implementation below
@@ -1035,10 +1035,15 @@ class vint4 {
     static const vint4 Giota ();
 
     /// Assign one value to all components.
-    const vint4 & operator= (int a);
+    vint4& operator=(int a) { load(a); return *this; }
 
     /// Assignment from another vint4
-    const vint4 & operator= (const vint4& other) ;
+#if !defined(__INTEL_COMPILER)
+    vint4& operator=(const vint4& other) = default;
+#else
+    // For explanation of the necessity of this, see implementation comment.
+    vint4& operator=(const vint4& other);
+#endif
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -1262,7 +1267,7 @@ class vint8 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vint8 () { }
+    vint8() = default;
 
     /// Construct from a single value (store it in all slots)
     vint8 (int a);
@@ -1289,7 +1294,7 @@ class vint8 {
     explicit vint8 (const char *vals);
 
     /// Copy construct from another vint8
-    vint8 (const vint8 & other) { m_simd = other.m_simd; }
+    vint8(const vint8& other) = default;
 
     /// Convert a vfloat8 to an vint8. Equivalent to i = (int)f;
     explicit vint8 (const vfloat8& f); // implementation below
@@ -1329,10 +1334,15 @@ class vint8 {
     static const vint8 Giota ();
 
     /// Assign one value to all components.
-    const vint8 & operator= (int a);
+    vint8& operator=(int a) { load(a); return *this; }
 
     /// Assignment from another vint8
-    const vint8 & operator= (const vint8& other) ;
+#if !defined(__INTEL_COMPILER)
+    vint8& operator=(const vint8& other) = default;
+#else
+    // For explanation of the necessity of this, see implementation comment.
+    vint8& operator=(const vint8& other);
+#endif
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -1561,7 +1571,7 @@ class vint16 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vint16 () { }
+    vint16() = default;
 
     /// Construct from a single value (store it in all slots)
     vint16 (int a);
@@ -1586,7 +1596,7 @@ class vint16 {
     explicit vint16 (const char *vals);
 
     /// Copy construct from another vint16
-    vint16 (const vint16 & other) { m_simd = other.m_simd; }
+    vint16(const vint16& other) = default;
 
     /// Convert a vfloat16 to an vint16. Equivalent to i = (int)f;
     explicit vint16 (const vfloat16& f); // implementation below
@@ -1629,10 +1639,15 @@ class vint16 {
     static const vint16 Giota ();
 
     /// Assign one value to all components.
-    const vint16 & operator= (int a);
+    vint16& operator=(int a) { load(a); return *this; }
 
     /// Assignment from another vint16
-    const vint16 & operator= (const vint16& other) ;
+#if !defined(__INTEL_COMPILER)
+    vint16& operator=(const vint16& other) = default;
+#else
+    // For explanation of the necessity of this, see implementation comment.
+    vint16& operator=(const vint16& other);
+#endif
 
     /// Component access (get)
     int operator[] (int i) const;
@@ -1873,7 +1888,7 @@ class vfloat4 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vfloat4 () { }
+    vfloat4() = default;
 
     /// Construct from a single value (store it in all slots)
     vfloat4 (float a) { load(a); }
@@ -1885,7 +1900,7 @@ class vfloat4 {
     vfloat4 (const float *f) { load (f); }
 
     /// Copy construct from another vfloat4
-    vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; }
+    vfloat4(const vfloat4 &other) = default;
 
     /// Construct from an vint4 (promoting all components to float)
     explicit vfloat4 (const vint4& ival);
@@ -1940,13 +1955,10 @@ class vfloat4 {
 #endif
 
     /// Assign a single value to all components
-    const vfloat4 & operator= (float a) { load(a); return *this; }
+    vfloat4& operator=(float a) { load(a); return *this; }
 
     /// Assign a vfloat4
-    const vfloat4 & operator= (vfloat4 other) {
-        m_simd = other.m_simd;
-        return *this;
-    }
+    vfloat4& operator=(const vfloat4& other) = default;
 
     /// Return a vfloat4 with all components set to 0.0
     static const vfloat4 Zero ();
@@ -2254,7 +2266,7 @@ class vfloat3 : public vfloat4 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vfloat3 () { }
+    vfloat3() = default;
 
     /// Construct from a single value (store it in all slots)
     vfloat3 (float a) { load(a); }
@@ -2278,7 +2290,7 @@ class vfloat3 : public vfloat4 {
     vfloat3(const V& v) : vfloat3(v[0], v[1], v[2]) { }
 
     /// Copy construct from another vfloat3
-    vfloat3 (const vfloat3 &other);
+    vfloat3(const vfloat3& other) = default;
 
     /// Construct from a vfloat4. Note: it will not zero out the internal
     /// 4th component, but rather accept on faith that the vfloat4 you are
@@ -2315,7 +2327,7 @@ class vfloat3 : public vfloat4 {
 #endif
 
     /// Assign a single value to all components
-    const vfloat3 & operator= (float a) { load(a); return *this; }
+    vfloat3& operator=(float a) { load(a); return *this; }
 
     /// Return a vfloat3 with all components set to 0.0
     static const vfloat3 Zero ();
@@ -2421,15 +2433,10 @@ class matrix44 {
     static constexpr int elements = 16;
 
     // Uninitialized
-    OIIO_FORCEINLINE matrix44() { }
+    OIIO_FORCEINLINE matrix44() = default;
 
     /// Copy constructor
-    OIIO_FORCEINLINE matrix44 (const matrix44 &M) {
-        m_row[0] = M[0];
-        m_row[1] = M[1];
-        m_row[2] = M[2];
-        m_row[3] = M[3];
-    }
+    OIIO_FORCEINLINE matrix44(const matrix44 &M) = default;
 
     /// Construct from a float array
     OIIO_FORCEINLINE explicit matrix44 (const float *f) {
@@ -2480,7 +2487,7 @@ class matrix44 {
     const vfloat4& operator[] (int i) const;
 
     /// Assignment
-    const matrix44& operator= (const matrix44& m);
+    matrix44& operator=(const matrix44& m) = default;
 
     /// Return the transposed matrix
     matrix44 transposed () const;
@@ -2556,7 +2563,7 @@ class vfloat8 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vfloat8 () { }
+    vfloat8() = default;
 
     /// Construct from a single value (store it in all slots)
     vfloat8 (float a) { load(a); }
@@ -2569,7 +2576,7 @@ class vfloat8 {
     vfloat8 (const float *f) { load (f); }
 
     /// Copy construct from another vfloat8
-    vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; }
+    vfloat8(const vfloat8 &other) = default;
 
     /// Construct from an int vector (promoting all components to float)
     explicit vfloat8 (const vint8& ival);
@@ -2607,13 +2614,10 @@ class vfloat8 {
 #endif
 
     /// Assign a single value to all components
-    const vfloat8& operator= (float a) { load(a); return *this; }
+    vfloat8& operator=(float a) { load(a); return *this; }
 
     /// Assign a vfloat8
-    const vfloat8& operator= (vfloat8 other) {
-        m_simd = other.m_simd;
-        return *this;
-    }
+    vfloat8& operator=(const vfloat8& other) = default;
 
     /// Return a vfloat8 with all components set to 0.0
     static const vfloat8 Zero ();
@@ -2877,7 +2881,7 @@ class vfloat16 {
     static constexpr size_t size() noexcept { return elements; }
 
     /// Default constructor (contents undefined)
-    vfloat16 () { }
+    vfloat16() = default;
 
     /// Construct from a single value (store it in all slots)
     vfloat16 (float a) { load(a); }
@@ -2892,7 +2896,7 @@ class vfloat16 {
     vfloat16 (const float *f) { load (f); }
 
     /// Copy construct from another vfloat16
-    vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; }
+    vfloat16(const vfloat16 &other) = default;
 
     /// Construct from an int vector (promoting all components to float)
     explicit vfloat16 (const vint16& ival);
@@ -2933,13 +2937,10 @@ class vfloat16 {
 #endif
 
     /// Assign a single value to all components
-    const vfloat16& operator= (float a) { load(a); return *this; }
+    vfloat16& operator=(float a) { load(a); return *this; }
 
     /// Assign a vfloat16
-    const vfloat16& operator= (vfloat16 other) {
-        m_simd = other.m_simd;
-        return *this;
-    }
+    vfloat16& operator=(const vfloat16& other) = default;
 
     /// Return a vfloat16 with all components set to 0.0
     static const vfloat16 Zero ();
@@ -3314,11 +3315,6 @@ OIIO_FORCEINLINE vbool4::vbool4 (const bool *a) {
     load (a[0], a[1], a[2], a[3]);
 }
 
-OIIO_FORCEINLINE const vbool4& vbool4::operator= (const vbool4 & other) {
-    m_simd = other.m_simd;
-    return *this;
-}
-
 
 OIIO_FORCEINLINE int vbool4::bitmask () const {
 #if OIIO_SIMD_SSE
@@ -3683,16 +3679,6 @@ OIIO_FORCEINLINE vbool8::vbool8 (const bool *a) {
 }
 
 
-OIIO_FORCEINLINE const vbool8& vbool8::operator= (bool a) {
-    load(a);
-    return *this;
-}
-
-OIIO_FORCEINLINE const vbool8& vbool8::operator= (const vbool8 & other) {
-    m_simd = other.m_simd;
-    return *this;
-}
-
 OIIO_FORCEINLINE int vbool8::bitmask () const {
 #if OIIO_SIMD_AVX
     return _mm256_movemask_ps(m_simd);
@@ -4003,17 +3989,6 @@ OIIO_FORCEINLINE vbool16::vbool16 (const bool *a) {
 }
 
 
-OIIO_FORCEINLINE const vbool16& vbool16::operator= (bool a) {
-    load(a);
-    return *this;
-}
-
-OIIO_FORCEINLINE const vbool16& vbool16::operator= (const vbool16 & other) {
-    m_simd = other.m_simd;
-    return *this;
-}
-
-
 OIIO_FORCEINLINE int vbool16::bitmask () const {
     return static_cast<int>(m_simd);
 }
@@ -4164,10 +4139,17 @@ OIIO_FORCEINLINE bool none (const vbool16& v) { return reduce_or(v) == false; }
 //////////////////////////////////////////////////////////////////////
 // vint4 implementation
 
-OIIO_FORCEINLINE const vint4 & vint4::operator= (const vint4& other) {
+#if defined(__INTEL_COMPILER)
+// For reasons we don't understand, all sorts of failures crop up only on icc
+// if we make this =default. Although we still support icc for now, it's a
+// discontinued compiler, so we special-case it here rather than spend a lot
+// of time investigating what might be broken (and would of course never be
+// fixed if it's a compiler bug).
+OIIO_FORCEINLINE vint4& vint4::operator=(const vint4& other) {
     m_simd = other.m_simd;
     return *this;
 }
+#endif
 
 OIIO_FORCEINLINE int vint4::operator[] (int i) const {
     OIIO_DASSERT(i<elements);
@@ -4332,8 +4314,6 @@ OIIO_FORCEINLINE vint4::vint4 (const short *vals) { load(vals); }
 OIIO_FORCEINLINE vint4::vint4 (const unsigned char *vals) { load(vals); }
 OIIO_FORCEINLINE vint4::vint4 (const char *vals) { load(vals); }
 
-OIIO_FORCEINLINE const vint4 & vint4::operator= (int a) { load(a); return *this; }
-
 
 OIIO_FORCEINLINE void vint4::store (int *values) const {
 #if OIIO_SIMD_SSE
@@ -5059,10 +5039,17 @@ OIIO_FORCEINLINE vint4 safe_mod (const vint4& a, int b) {
 //////////////////////////////////////////////////////////////////////
 // vint8 implementation
 
-OIIO_FORCEINLINE const vint8 & vint8::operator= (const vint8& other) {
+#if defined(__INTEL_COMPILER)
+// For reasons we don't understand, all sorts of failures crop up only on icc
+// if we make this =default. Although we still support icc for now, it's a
+// discontinued compiler, so we special-case it here rather than spend a lot
+// of time investigating what might be broken (and would of course never be
+// fixed if it's a compiler bug).
+OIIO_FORCEINLINE vint8& vint8::operator=(const vint8& other) {
     m_simd = other.m_simd;
     return *this;
 }
+#endif
 
 OIIO_FORCEINLINE int vint8::operator[] (int i) const {
     OIIO_DASSERT(i<elements);
@@ -5214,8 +5201,6 @@ OIIO_FORCEINLINE vint8::vint8 (const short *vals) { load(vals); }
 OIIO_FORCEINLINE vint8::vint8 (const unsigned char *vals) { load(vals); }
 OIIO_FORCEINLINE vint8::vint8 (const char *vals) { load(vals); }
 
-OIIO_FORCEINLINE const vint8 & vint8::operator= (int a) { load(a); return *this; }
-
 
 OIIO_FORCEINLINE void vint8::store (int *values) const {
 #if OIIO_SIMD_AVX
@@ -5884,10 +5869,17 @@ OIIO_FORCEINLINE vint8 safe_mod (const vint8& a, int b) {
 //////////////////////////////////////////////////////////////////////
 // vint16 implementation
 
-OIIO_FORCEINLINE const vint16 & vint16::operator= (const vint16& other) {
+#if defined(__INTEL_COMPILER)
+// For reasons we don't understand, all sorts of failures crop up only on icc
+// if we make this =default. Although we still support icc for now, it's a
+// discontinued compiler, so we special-case it here rather than spend a lot
+// of time investigating what might be broken (and would of course never be
+// fixed if it's a compiler bug).
+OIIO_FORCEINLINE vint16& vint16::operator=(const vint16& other) {
     m_simd = other.m_simd;
     return *this;
 }
+#endif
 
 OIIO_FORCEINLINE int vint16::operator[] (int i) const {
     OIIO_DASSERT(i<elements);
@@ -6023,8 +6015,6 @@ OIIO_FORCEINLINE vint16::vint16 (const short *vals) { load(vals); }
 OIIO_FORCEINLINE vint16::vint16 (const unsigned char *vals) { load(vals); }
 OIIO_FORCEINLINE vint16::vint16 (const char *vals) { load(vals); }
 
-OIIO_FORCEINLINE const vint16 & vint16::operator= (int a) { load(a); return *this; }
-
 
 OIIO_FORCEINLINE void vint16::load_mask (const vbool16 &mask, const int *values) {
 #if OIIO_SIMD_AVX >= 512
@@ -8069,14 +8059,6 @@ OIIO_FORCEINLINE vint4 AxBxCxDx (const vint4& a, const vint4& b,
 //////////////////////////////////////////////////////////////////////
 // vfloat3 implementation
 
-OIIO_FORCEINLINE vfloat3::vfloat3 (const vfloat3 &other)  : vfloat4(other) {
-#if OIIO_SIMD_SSE || OIIO_SIMD_NEON
-    m_simd = other.m_simd;
-#else
-    SIMD_CONSTRUCT_PAD (other[i]);
-#endif
-}
-
 OIIO_FORCEINLINE vfloat3::vfloat3 (const vfloat4 &other) {
 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
     m_simd = other.simd();
@@ -8340,16 +8322,6 @@ OIIO_FORCEINLINE const vfloat4& matrix44::operator[] (int i) const {
 }
 
 
-OIIO_FORCEINLINE const matrix44& matrix44::operator= (const matrix44& m)
-{
-    m_row[0] = m[0];
-    m_row[1] = m[1];
-    m_row[2] = m[2];
-    m_row[3] = m[3];
-    return *this;
-}
-
-
 OIIO_FORCEINLINE matrix44 matrix44::transposed () const {
 #if OIIO_SIMD_SSE
     matrix44 T;
@@ -10360,6 +10332,19 @@ template<> struct formatter<OIIO::simd::matrix44>
     : OIIO::pvt::array_formatter<OIIO::simd::matrix44, float, 16> {};
 } // namespace fmt
 
+
+// Allow C++ metaprogramming to understand that the simd types are trivially
+// copyable (i.e. memcpy to copy simd types is fine).
+namespace std {  // not necessary in C++17, just say std::is_trivially_copyable
+#if defined(__INTEL_COMPILER)
+// Necessary because we have to define the vint types copy constructors on icc
+template<> struct is_trivially_copyable<OIIO::simd::vint4> : std::true_type {};
+template<> struct is_trivially_copyable<OIIO::simd::vint8> : std::true_type {};
+template<> struct is_trivially_copyable<OIIO::simd::vint16> : std::true_type {};
+#endif
+}  // namespace std
+
+
 #undef SIMD_DO
 #undef SIMD_CONSTRUCT
 #undef SIMD_CONSTRUCT_PAD
diff --git a/src/include/OpenImageIO/unittest.h b/src/include/OpenImageIO/unittest.h
index 6ffcc45d34..a605cc5dfd 100644
--- a/src/include/OpenImageIO/unittest.h
+++ b/src/include/OpenImageIO/unittest.h
@@ -231,16 +231,17 @@ static OIIO::pvt::UnitTestFailureCounter unit_test_failures;
                        << "'\n"),                                             \
             (void)++unit_test_failures))
 
-#define OIIO_CHECK_SIMD_EQUAL_THRESH(x, y, eps)                               \
-    (all(abs((x) - (y)) <= (eps))                                             \
-         ? ((void)0)                                                          \
-         : ((std::cout << OIIO::Sysutil::Term(std::cout).ansi("red,bold")     \
-                       << __FILE__ << ":" << __LINE__ << ":\n"                \
-                       << "FAILED: "                                          \
-                       << OIIO::Sysutil::Term(std::cout).ansi("normal") << #x \
-                       << " == " << #y << "\n"                                \
-                       << "\tvalues were '" << (x) << "' and '" << (y)        \
-                       << "'\n"),                                             \
+#define OIIO_CHECK_SIMD_EQUAL_THRESH(x, y, eps)                                \
+    ((all(abs((x) - (y)) <= eps))                                              \
+         ? ((void)0)                                                           \
+         : (Strutil::print("{}{}:{}:\nFAILED: {}{} == {}\n",                   \
+                           OIIO::Sysutil::Term(std::cout).ansi("red,bold"),    \
+                           __FILE__, __LINE__,                                 \
+                           OIIO::Sysutil::Term(std::cout).ansi("normal"), #x,  \
+                           #y),                                                \
+            Strutil::print(                                                    \
+                "\tvalues were '{:.8g}' and '{:.8g}', diff was {:.8g}\n", (x), \
+                (y), abs((x) - (y))),                                          \
             (void)++unit_test_failures))
 
 
diff --git a/src/libutil/simd_test.cpp b/src/libutil/simd_test.cpp
index 99c377a303..ca67ef75d6 100644
--- a/src/libutil/simd_test.cpp
+++ b/src/libutil/simd_test.cpp
@@ -1611,8 +1611,8 @@ void test_mathfuncs ()
     VEC expA = mkvec<VEC> (0.367879441171442f, 1.0f, 2.718281828459045f, 90.0171313005218f);
     OIIO_CHECK_SIMD_EQUAL (exp(A), expA);
     OIIO_CHECK_SIMD_EQUAL_THRESH (log(expA), A, 1e-6f);
-    OIIO_CHECK_SIMD_EQUAL (fast_exp(A),
-                mkvec<VEC>(fast_exp(A[0]), fast_exp(A[1]), fast_exp(A[2]), fast_exp(A[3])));
+    OIIO_CHECK_SIMD_EQUAL_THRESH (fast_exp(A),
+                mkvec<VEC>(fast_exp(A[0]), fast_exp(A[1]), fast_exp(A[2]), fast_exp(A[3])), 1e-5f);
     OIIO_CHECK_SIMD_EQUAL_THRESH (fast_log(expA),
                 mkvec<VEC>(fast_log(expA[0]), fast_log(expA[1]), fast_log(expA[2]), fast_log(expA[3])), 0.00001f);
     OIIO_CHECK_SIMD_EQUAL_THRESH (fast_pow_pos(VEC(2.0f), A),
@@ -1939,6 +1939,25 @@ test_matrix()
 
 
 
+static void
+test_trivially_copyable()
+{
+    print("\nTesting trivially_copyable on all SIMD classes\n");
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vbool4>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vint4>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vfloat4>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vfloat3>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<matrix44>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vbool8>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vint8>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vfloat8>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vbool16>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vint16>::value);
+    OIIO_CHECK_ASSERT(std::is_trivially_copyable<vfloat16>::value);
+}
+
+
+
 int
 main(int argc, char* argv[])
 {
@@ -2094,6 +2113,7 @@ main(int argc, char* argv[])
     test_special();
     test_metaprogramming();
     test_matrix();
+    test_trivially_copyable();
 
     std::cout << "\nTotal time: " << Strutil::timeintervalformat(timer())
               << "\n";