From 0476985291eac25685885a5e4c68610dc5a09af5 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Thu, 30 Mar 2023 07:00:27 -0600 Subject: [PATCH] Add half_t and bhalf_t limits (#5778) * Add half_t and bhalf_t limits * Try using constexpr * Revert "Try using constexpr" This reverts commit 1b399bdd43da2b648d5f7cedbade0dfff51f8e43. * Fix norm_min_helper value type * Add bias to epsilon when bhalf_t is float * Remove bias. Prevent compiler from optimizing out cast. * Fix typo * Attempt to fix CI Werror * core/unit_test: Add inline comment * Add half_t docs --- core/src/Kokkos_Half.hpp | 317 ++++++++++++++++++++++++++- core/unit_test/TestHalfOperators.hpp | 31 ++- 2 files changed, 337 insertions(+), 11 deletions(-) diff --git a/core/src/Kokkos_Half.hpp b/core/src/Kokkos_Half.hpp index 9231fac5ff..82dd55549d 100644 --- a/core/src/Kokkos_Half.hpp +++ b/core/src/Kokkos_Half.hpp @@ -21,8 +21,10 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF #endif -#include #include +#include + +#include #include // istream & ostream for extraction and insertion ops #include @@ -1005,6 +1007,319 @@ cast_from_bhalf(bhalf_t val) { #else #define KOKKOS_BHALF_T_IS_FLOAT false #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED + ////////////// BEGIN HALF_T (binary16) limits ////////////// + // clang-format off +// '\brief:' below are from the libc definitions for float and double: +// https://www.gnu.org/software/libc/manual/html_node/Floating-Point-Parameters.html +// +// The arithmetic encoding and equations below are derived from: +// Ref1: https://en.wikipedia.org/wiki/Single-precision_floating-point_format +// Ref2: https://en.wikipedia.org/wiki/Exponent_bias +// Ref3: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html +// +// Some background on the magic numbers 2**10=1024 and 2**15=32768 used below: +// +// IMPORTANT: For IEEE754 encodings, see Ref1. +// +// For binary16, we have B = 2 and p = 11 (10 stored fraction bits plus the implicit leading 1); the 16-bit encoding has 2**16 possible bit patterns. 
+// The binary16 format is: [s e e e e e f f f f f f f f f f] +// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +// s: sign bit (1 bit) +// e: exponent bits (5 bits) +// f: fractional bits (10 bits) +// +// E_bias = 2**(n_exponent_bits - 1) - 1 = 2**(5 - 1) - 1 = 15 +// E_subnormal = 00000 (base2) +// E_infinity = 11111 (base2) +// E_min = 1 - E_bias = 1 - 15 = -14 +// E_max = 2**5 - 2 - E_bias = 2**5 - 2 - 15 = 15 (the all-ones exponent is reserved for infinity/NaN) +// +// 2**10=1024 is the denominator of the smallest fraction step in binary16, i.e. the weight of the lowest fraction bit: +// [s e e e e e f f f f f f f f f f] +// [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1] +// which adds 1 / 2**10 = 2**-10 to the significand (with the all-zero exponent shown this is the subnormal 2**-14 * 2**-10 = 2**-24) +// +// +// 2**15 is the largest exponent factor representable in binary16, for example the +// largest finite value representable in binary16 is: +// [s e e e e e f f f f f f f f f f] +// [0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +// which is: 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) = +// 2**15 * (1 + 0.9990234375) = +// 65504.0 +// + +/// \brief: Infinity. +/// +/// base2 encoding: bits [10,14] set +/// #define KOKKOS_IMPL_HALF_T_HUGE_VALH 0x7c00 +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + +/// \brief: Lowest finite number (the most negative finite value) +/// +/// Stdc defines this as the smallest number (representable in binary16). 
+/// +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: -1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) +/// = -2**15 * (1 + (2**10 - 1) / 2**10) +template <> +struct Kokkos::Experimental::Impl::finite_min_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = -65504.0F; +}; + +/// \brief: Maximum finite number +/// +/// Stdc defines this as the maximum number (representable in binary16). +/// +/// Binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) +/// = 2**15 * (1 + (2**10 - 1) / 2**10) +template <> +struct Kokkos::Experimental::Impl::finite_max_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 65504.0F; +}; + +/// \brief: This is the difference between 1 and the smallest floating point +/// number of type binary16 that is greater than 1 +/// +/// Smallest number in binary16 that is greater than 1 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**3 + 2**2 + 2**1 + 2**0 - 15) * (1 + 2**-10) +/// = 2**0 * (1 + 2**-10) +/// = 1.0009765625 +/// +/// Lastly, 1.0009765625 - 1 = 0.0009765625 = 2**-10. +template <> +struct Kokkos::Experimental::Impl::epsilon_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.0009765625F; +}; + +/// \brief: The largest possible rounding error in ULPs +/// +/// This simply uses the maximum rounding error. 
+/// +/// Reference: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#689 +template <> +struct Kokkos::Experimental::Impl::round_error_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.5F; +}; + +/// \brief: Minimum normalized positive half precision number +/// +/// Stdc defines this as the minimum normalized positive floating +/// point number that is representable in type binary16 +/// +/// Smallest positive normalized number in binary16 encoding: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**0 - 15) * (1) +/// = 2**-14 +template <> +struct Kokkos::Experimental::Impl::norm_min_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = 0.00006103515625F; +}; + +/// \brief: Quiet not a half precision number +/// +/// IEEE 754-2008 binary formats encode this as all exponent bits high with the first (most significant) fraction bit high; with a zero fraction the pattern would be an infinity instead. +/// The helper stores a float, so the value is a float NaN: assigning an integer bit pattern such as 0xfc000 would only convert that integer to the finite float 1032192. +/// Quiet NaN in binary16, for example: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +template <> +struct Kokkos::Experimental::Impl::quiet_NaN_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = __builtin_nanf(""); +}; + +/// \brief: Signaling not a half precision number +/// +/// IEEE 754-2008 binary formats encode this as all exponent bits high, the first fraction bit low and at least one other fraction bit high. +/// NOTE(review): the value is a float signaling NaN; converting it to half_t may quiet it on some targets - confirm. +/// Signaling NaN in binary16, for example: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +template <> +struct Kokkos::Experimental::Impl::signaling_NaN_helper< + Kokkos::Experimental::half_t> { + static constexpr float value = __builtin_nansf(""); +}; + +/// \brief: Number of digits in the mantissa that can be represented +/// without losing precision. +/// +/// Stdc defines this as the number of base-RADIX digits in the floating point mantissa for the binary16 data type. 
+/// +/// In binary16, we have 10 fractional bits plus the implicit leading 1. +template <> +struct Kokkos::Experimental::Impl::digits_helper<Kokkos::Experimental::half_t> { + static constexpr int value = 11; +}; + +/// \brief: "The number of base-10 digits that can be represented by the type T without change" +/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10. +/// +/// "For base-radix types, it is the value of digits() (digits - 1 for floating-point types) multiplied by log10(radix) and rounded down." +/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10. +/// +/// This is: floor((11 - 1) * log10(2)) = floor(3.01) = 3 +template <> +struct Kokkos::Experimental::Impl::digits10_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = 3; +}; + +/// \brief: Value of the base of the exponent representation. +/// +/// Stdc defines this as the value of the base, or radix, of the exponent representation. +template <> +struct Kokkos::Experimental::Impl::radix_helper<Kokkos::Experimental::half_t> { + static constexpr int value = 2; +}; + +/// \brief: This is the smallest possible exponent value +/// +/// Stdc defines this as the smallest possible exponent value for type binary16. +/// More precisely, it is the minimum negative integer such that the radix (2) +/// raised to this power minus 1 can be represented as a normalized floating point number of type binary16. 
+/// +/// In binary16: +/// [s e e e e e f f f f f f f f f f] +/// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**0 - 15) * (1 + 0) +/// = 2**-14 +/// +/// with a bias of one from (C11 5.2.4.2.2), gives -14 + 1 = -13. +template <> +struct Kokkos::Experimental::Impl::min_exponent_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = -13; +}; + +/// \brief: This is the largest possible exponent value +/// +/// In binary16: +/// [s e e e e e f f f f f f f f f f] +/// [0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 0) +/// = 2**(30 - 15) +/// = 2**15 +/// +/// with a bias of one from (C11 5.2.4.2.2), gives 15 + 1 = 16. +template <> +struct Kokkos::Experimental::Impl::max_exponent_helper< + Kokkos::Experimental::half_t> { + static constexpr int value = 16; +}; +#endif +////////////// END HALF_T (binary16) limits ////////////// + +////////////// BEGIN BHALF_T (bfloat16) limits ////////////// +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +// Lowest finite bhalf number: -(2 - 2^-7) * 2^127 +template <> +struct Kokkos::Experimental::Impl::finite_min_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = -3.38953139e38; +}; +// Largest finite bhalf number: (2 - 2^-7) * 2^127 +template <> +struct Kokkos::Experimental::Impl::finite_max_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 3.38953139e38; +}; +// Epsilon: 1/2^7, the gap between 1 and the next representable bhalf number +template <> +struct Kokkos::Experimental::Impl::epsilon_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0.0078125F; +}; +template <> +struct Kokkos::Experimental::Impl::round_error_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = 0.5F; +}; +// Minimum normalized positive bhalf number +template <> +struct Kokkos::Experimental::Impl::norm_min_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr 
float value = 1.1754494351e-38; +}; +// Quiet not a bhalf number (a float NaN, since the helper stores a float; the integer literal 0x7fc000 would convert to a finite float) +template <> +struct Kokkos::Experimental::Impl::quiet_NaN_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = __builtin_nanf(""); +}; +// Signaling not a bhalf number (a float signaling NaN; an integer literal would convert to a finite float) +template <> +struct Kokkos::Experimental::Impl::signaling_NaN_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr float value = __builtin_nansf(""); +}; +// Number of digits in the mantissa that can be represented +// without losing precision: 7 fraction bits plus the implicit leading 1. +template <> +struct Kokkos::Experimental::Impl::digits_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 8; +}; +// floor((8 - 1) * log10(2)) = floor(2.107) = 2 +template <> +struct Kokkos::Experimental::Impl::digits10_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 2; +}; +// Value of the base of the exponent representation. +template <> +struct Kokkos::Experimental::Impl::radix_helper<Kokkos::Experimental::bhalf_t> { + static constexpr int value = 2; +}; +// This is the smallest possible exponent value +// with a bias of one (C11 5.2.4.2.2). +template <> +struct Kokkos::Experimental::Impl::min_exponent_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = -125; +}; +// This is the largest possible exponent value +// with a bias of one (C11 5.2.4.2.2). 
+template <> +struct Kokkos::Experimental::Impl::max_exponent_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr int value = 128; +}; +#endif +////////////// END BHALF_T (bfloat16) limits ////////////// + #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp index 6a2bc359e5..29844a3c6a 100644 --- a/core/unit_test/TestHalfOperators.hpp +++ b/core/unit_test/TestHalfOperators.hpp @@ -17,8 +17,6 @@ #ifndef TESTHALFOPERATOR_HPP_ #define TESTHALFOPERATOR_HPP_ namespace Test { -#define FP16_EPSILON 0.0009765625F // 1/2^10 -#define BF16_EPSILON 0.0078125F // 1/2^7 using namespace Kokkos::Experimental; using ExecutionSpace = TEST_EXECSPACE; using ScalarType = double; @@ -26,9 +24,19 @@ using ViewType = Kokkos::View; using ViewTypeHost = Kokkos::View; KOKKOS_FUNCTION const half_t& accept_ref(const half_t& a) { return a; } +KOKKOS_FUNCTION +double accept_ref_expected(const half_t& a) { + double tmp = static_cast<double>(a); // widen to double inside a function so the cast is not optimized out + return tmp; +} #if !KOKKOS_BHALF_T_IS_FLOAT KOKKOS_FUNCTION const bhalf_t& accept_ref(const bhalf_t& a) { return a; } +KOKKOS_FUNCTION +double accept_ref_expected(const bhalf_t& a) { + double tmp = static_cast<double>(a); // widen to double inside a function so the cast is not optimized out + return tmp; +} #endif // !KOKKOS_BHALF_T_IS_FLOAT enum OP_TESTS { @@ -886,8 +894,16 @@ struct Functor_TestHalfOperators { // actual_lhs(TW) = h_lhs <=> h_rhs; // Need C++20? // expected_lhs(TW) = d_lhs <=> d_rhs; // Need C++20? - actual_lhs(PASS_BY_REF) = static_cast(accept_ref(h_lhs)); - expected_lhs(PASS_BY_REF) = d_lhs; + actual_lhs(PASS_BY_REF) = static_cast(accept_ref(h_lhs)); + + // Use accept_ref and accept_ref_expected to ensure the compiler + // does not optimize out the casts half_type -> double -> half_type. + // Note that these casts are accompanied by rounding. 
For the bhalf_t + // epsilon, the rounding performed by these casts is enough to cause + // the unit tests to fail. + // In short, one cannot simply assign static_cast<double>(h_lhs) to + // expected_lhs(PASS_BY_REF). + expected_lhs(PASS_BY_REF) = accept_ref_expected(h_lhs); half_tmp = static_cast(h_lhs); tmp_ptr = &(tmp_lhs = half_tmp); @@ -910,12 +926,7 @@ struct Functor_TestHalfOperators { template void __test_half_operators(half_type h_lhs, half_type h_rhs) { - double epsilon = FLT_EPSILON; - - if (std::is_same::value) - epsilon = FP16_EPSILON; - if (std::is_same::value) - epsilon = BF16_EPSILON; + double epsilon = Kokkos::Experimental::epsilon::value; Functor_TestHalfOperators f_device(h_lhs, h_rhs); Functor_TestHalfOperators f_host(h_lhs, h_rhs);