From 0476985291eac25685885a5e4c68610dc5a09af5 Mon Sep 17 00:00:00 2001
From: Evan Harvey <57234914+e10harvey@users.noreply.github.com>
Date: Thu, 30 Mar 2023 07:00:27 -0600
Subject: [PATCH] Add half_t and bhalf_t limits (#5778)

* Add half_t and bhalf_t limits

* Try using constexpr

* Revert "Try using constexpr"

This reverts commit 1b399bdd43da2b648d5f7cedbade0dfff51f8e43.

* Fix norm_min_helper value type

* Add bias to epsilon when bhalf_t is float

* Remove bias. Prevent compiler from optimizing out cast.

* Fix typo

* Attempt to fix CI Werror

* core/unit_test: Add inline comment

* Add half_t docs
---
 core/src/Kokkos_Half.hpp             | 317 ++++++++++++++++++++++++++-
 core/unit_test/TestHalfOperators.hpp |  31 ++-
 2 files changed, 337 insertions(+), 11 deletions(-)
diff --git a/core/src/Kokkos_Half.hpp b/core/src/Kokkos_Half.hpp
index 9231fac5ff..82dd55549d 100644
--- a/core/src/Kokkos_Half.hpp
+++ b/core/src/Kokkos_Half.hpp
@@ -21,8 +21,10 @@
 #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
 #endif
 
-#include <type_traits>
 #include <Kokkos_Macros.hpp>
+#include <Kokkos_NumericTraits.hpp>
+
+#include <type_traits>
 #include <iosfwd>  // istream & ostream for extraction and insertion ops
 #include <string>
 
@@ -1005,6 +1007,319 @@ cast_from_bhalf(bhalf_t val) {
 #else
 #define KOKKOS_BHALF_T_IS_FLOAT false
 #endif  // KOKKOS_IMPL_BHALF_TYPE_DEFINED
+        ////////////// BEGIN HALF_T (binary16) limits //////////////
+        // clang-format off
+// '\brief:' below are from the libc definitions for float and double:
+// https://www.gnu.org/software/libc/manual/html_node/Floating-Point-Parameters.html
+//
+// The arithmetic encoding and equations below are derived from:
+// Ref1: https://en.wikipedia.org/wiki/Single-precision_floating-point_format
+// Ref2: https://en.wikipedia.org/wiki/Exponent_bias
+// Ref3: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html
+//
+// Some background on the magic numbers 2**10=1024 and 2**15=32768 used below:
+//
+// IMPORTANT: For IEEE754 encodings, see Ref1.
+//
+// For binary16, we have B = 2 and 16 bits of storage, i.e. 2**16 possible encodings.
+// The binary16 format is: [s  e  e  e  e  e  f f f f f f f f f f]
+//              bit index:  15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+// s: signed bit (1 bit)
+// e: exponent bits (5 bits)
+// f: fractional bits (10 bits)
+//
+// E_bias      = 2**(n_exponent_bits - 1) - 1 = 2**(5 - 1) - 1 = 15
+// E_subnormal = 00000 (base2)
+// E_infinity  = 11111 (base2)
+// E_min       = 1 - E_bias = 1 - 15 = -14
+// E_max       = 2**5 - 2 - E_bias = 2**5 - 2 - 15 = 15 (11111 is reserved for inf/NaN)
+//
+// 2**10=1024 is the denominator of the least significant fraction bit in binary16:
+// [s  e  e  e  e  e  f f f f f f f f f f]
+// [0  0  0  0  0  0  0 0 0 0 0 0 0 0 0 1]
+// whose fraction field is: 1 / 2**10 = 2**-10
+//
+//
+// 2**15 is the largest exponent factor representable in binary16, for example the
+// largest integer value representable in binary16 is:
+// [s  e  e  e  e  e  f f f f f f f f f f]
+// [0  1  1  1  1  0  1 1 1 1 1 1 1 1 1 1]
+// which is: 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1) =
+//           2**15 * (1 + 0.9990234375) =
+//           65504.0
+//
+
+/// \brief: Infinity.
+///
+/// base2 encoding: bits [10,14] set
+/// #define KOKKOS_IMPL_HALF_T_HUGE_VALH 0x7c00
+/// Binary16 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  1  0 0 0 0 0 0 0 0 0 0]
+/// bit index:  15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+
+/// \brief: Lowest (most negative) finite number
+///
+/// This is the most negative finite number representable in binary16.
+///
+/// Binary16 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [1  1  1  1  1  0  1 1 1 1 1 1 1 1 1 1]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: -1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1)
+///              = -2**15 * (1 + (2**10 - 1) / 2**10) = -65504
+template <>
+struct Kokkos::Experimental::Impl::finite_min_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr float value = -65504.0F;
+};
+
+/// \brief: Maximum finite number
+///
+/// Stdc defines this as the maximum finite number (representable in binary16).
+///
+/// Binary16 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  0  1 1 1 1 1 1 1 1 1 1]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 2**-10 + 2**-9 + 2**-8 + 2**-7 + 2**-6 + 2**-5 + 2**-4 + 2**-3 + 2**-2 + 2**-1)
+///              = 2**15 * (1 + (2**10 - 1) / 2**10) = 65504
+template <>
+struct Kokkos::Experimental::Impl::finite_max_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr float value = 65504.0F;
+};
+
+/// \brief: This is the difference between 1 and the smallest floating point
+///         number of type binary16 that is greater than 1
+///
+/// Smallest number in binary16 that is greater than 1 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  0  1  1  1  1  0 0 0 0 0 0 0 0 0 1]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: 1 * 2**(2**3 + 2**2 + 2**1 + 2**0 - 15) * (1 + 2**-10)
+///                = 2**0 * (1 + 2**-10)
+///                = 1.0009765625
+///
+/// Lastly, 1.0009765625 - 1 = 0.0009765625 = 2**-10.
+template <>
+struct Kokkos::Experimental::Impl::epsilon_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr float value = 0.0009765625F;
+};
+
+/// \brief: The largest possible rounding error in ULPs
+///
+/// This is 0.5 ULP, the maximum rounding error (see the reference below).
+///
+/// Reference: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html#689
+template <>
+struct Kokkos::Experimental::Impl::round_error_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr float value = 0.5F;
+};
+
+/// \brief: Minimum normalized positive half precision number
+///
+/// Stdc defines this as the minimum normalized positive floating
+/// point number that is representable in type binary16
+///
+/// Smallest positive normalized number in binary16 encoding:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  0  0  0  0  1  0 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: 1 * 2**(2**0 - 15) * (1)
+///                = 2**-14 = 0.00006103515625
+template <>
+struct Kokkos::Experimental::Impl::norm_min_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr float value = 0.00006103515625F;
+};
+
+/// \brief: Quiet not a half precision number
+///
+/// IEEE 754 encodes this as all exponent bits and the first fraction bit high.
+/// `value` is a float, so it must hold a float NaN rather than the bit pattern.
+/// Quiet NaN in binary16 (0x7e00):
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  1  1 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+template <>
+struct Kokkos::Experimental::Impl::quiet_NaN_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr float value = __builtin_nanf("");
+};
+
+/// \brief: Signaling not a half precision number
+///
+/// IEEE 754 encodes this as all exponent bits high, the first fraction bit low
+/// and a non-zero fraction. `value` is a float, so it holds a float signaling NaN.
+/// Signaling NaN in binary16 (e.g. 0x7d00):
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  1  0 1 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+template <>
+struct Kokkos::Experimental::Impl::signaling_NaN_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr float value = __builtin_nansf("");
+};
+
+/// \brief: Number of digits in the mantissa that can be represented
+///         without losing precision.
+///
+/// Stdc defines this as the number of base-RADIX digits in the floating point mantissa for the binary16 data type.
+///
+/// In binary16, we have 10 fractional bits plus the implicit leading 1.
+template <>
+struct Kokkos::Experimental::Impl::digits_helper<Kokkos::Experimental::half_t> {
+  static constexpr int value = 11;
+};
+
+/// \brief: "The number of base-10 digits that can be represented by the type T without change"
+/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10.
+///
+/// "For base-radix types, it is the value of digits() (digits - 1 for floating-point types) multiplied by log10(radix) and rounded down."
+/// Reference: https://en.cppreference.com/w/cpp/types/numeric_limits/digits10.
+///
+/// This is: floor((11 - 1) * log10(2)) = floor(3.01...) = 3
+template <>
+struct Kokkos::Experimental::Impl::digits10_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr int value = 3;
+};
+
+/// \brief: Value of the base of the exponent representation.
+///
+/// Stdc defines this as the value of the base, or radix, of the exponent representation.
+template <>
+struct Kokkos::Experimental::Impl::radix_helper<Kokkos::Experimental::half_t> {
+  static constexpr int value = 2;
+};
+
+/// \brief: This is the smallest possible exponent value
+///
+/// Stdc defines this as the smallest possible exponent value for type binary16.
+/// More precisely, it is the minimum negative integer such that the radix (radix_helper)
+/// raised to this power minus 1 can be represented as a normalized floating point number of type binary16.
+///
+/// In binary16:
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  0  0  0  0  1  0 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: 1 * 2**(2**0 - 15) * (1 + 0)
+///                = 2**-14
+///
+/// with a bias of one from (C11 5.2.4.2.2), gives -13;
+template <>
+struct Kokkos::Experimental::Impl::min_exponent_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr int value = -13;
+};
+
+/// \brief: This is the largest possible exponent value
+///
+/// In binary16 (largest exponent field that still encodes a finite number):
+///             [s  e  e  e  e  e  f f f f f f f f f f]
+///             [0  1  1  1  1  0  0 0 0 0 0 0 0 0 0 0]
+/// bit index:   15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+///
+/// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 0)
+///                = 2**(30 - 15)
+///                = 2**15
+///
+/// with a bias of one from (C11 5.2.4.2.2), gives 16;
+template <>
+struct Kokkos::Experimental::Impl::max_exponent_helper<
+    Kokkos::Experimental::half_t> {
+  static constexpr int value = 16;
+};
+#endif
+////////////// END HALF_T (binary16) limits //////////////
+
+////////////// BEGIN BHALF_T (bfloat16) limits //////////////
+#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
+// Lowest (most negative) finite number: -(2 - 2**-7) * 2**127
+template <>
+struct Kokkos::Experimental::Impl::finite_min_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr float value = -3.38953139e38F;
+};
+// Maximum finite number: (2 - 2**-7) * 2**127
+template <>
+struct Kokkos::Experimental::Impl::finite_max_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr float value = 3.38953139e38F;
+};
+// Machine epsilon: 1/2^7 (bfloat16 stores 7 fraction bits)
+template <>
+struct Kokkos::Experimental::Impl::epsilon_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr float value = 0.0078125F;
+};
+template <>
+struct Kokkos::Experimental::Impl::round_error_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr float value = 0.5F;  // largest rounding error, in ULPs
+};
+// Minimum normalized positive bhalf number: 2**-126
+template <>
+struct Kokkos::Experimental::Impl::norm_min_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr float value = 1.1754494351e-38F;
+};
+// Quiet not a bhalf number (bfloat16 0x7fc0), stored as a float quiet NaN
+template <>
+struct Kokkos::Experimental::Impl::quiet_NaN_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr float value = __builtin_nanf("");
+};
+// Signaling not a bhalf number, stored as a float signaling NaN
+template <>
+struct Kokkos::Experimental::Impl::signaling_NaN_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr float value = __builtin_nansf("");
+};
+// Number of base-radix digits in the mantissa: bfloat16 has 7 stored
+// fraction bits plus the implicit leading 1.
+template <>
+struct Kokkos::Experimental::Impl::digits_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = 8;
+};
+// floor((8 - 1) * log10(2)) = floor(2.107...) = 2, with 8 mantissa digits
+template <>
+struct Kokkos::Experimental::Impl::digits10_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = 2;
+};
+// Value of the base (radix) of the exponent representation.
+template <>
+struct Kokkos::Experimental::Impl::radix_helper<Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = 2;
+};
+// This is the smallest possible exponent value: norm_min is 2**-126, which
+// with a bias of one (C11 5.2.4.2.2) gives -125.
+template <>
+struct Kokkos::Experimental::Impl::min_exponent_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = -125;
+};
+// This is the largest possible exponent value: the largest finite power is
+// 2**127, which with a bias of one (C11 5.2.4.2.2) gives 128.
+template <>
+struct Kokkos::Experimental::Impl::max_exponent_helper<
+    Kokkos::Experimental::bhalf_t> {
+  static constexpr int value = 128;
+};
+#endif
+////////////// END BHALF_T (bfloat16) limits //////////////
+
 #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE
 #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_HALF
diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp
index 6a2bc359e5..29844a3c6a 100644
--- a/core/unit_test/TestHalfOperators.hpp
+++ b/core/unit_test/TestHalfOperators.hpp
@@ -17,8 +17,6 @@
 #ifndef TESTHALFOPERATOR_HPP_
 #define TESTHALFOPERATOR_HPP_
 namespace Test {
-#define FP16_EPSILON 0.0009765625F  // 1/2^10
-#define BF16_EPSILON 0.0078125F     // 1/2^7
 using namespace Kokkos::Experimental;
 using ExecutionSpace = TEST_EXECSPACE;
 using ScalarType     = double;
@@ -26,9 +24,19 @@ using ViewType       = Kokkos::View<ScalarType*, ExecutionSpace>;
 using ViewTypeHost   = Kokkos::View<ScalarType*, Kokkos::HostSpace>;
 KOKKOS_FUNCTION
 const half_t& accept_ref(const half_t& a) { return a; }
+KOKKOS_FUNCTION
+double accept_ref_expected(const half_t& a) {
+  const double as_double = static_cast<double>(a);  // half_t -> double
+  return as_double;
+}
 #if !KOKKOS_BHALF_T_IS_FLOAT
 KOKKOS_FUNCTION
 const bhalf_t& accept_ref(const bhalf_t& a) { return a; }
+KOKKOS_FUNCTION
+double accept_ref_expected(const bhalf_t& a) {
+  const double as_double = static_cast<double>(a);  // bhalf_t -> double
+  return as_double;
+}
 #endif  // !KOKKOS_BHALF_T_IS_FLOAT
 
 enum OP_TESTS {
@@ -886,8 +894,16 @@ struct Functor_TestHalfOperators {
     // actual_lhs(TW)   = h_lhs <=> h_rhs;  // Need C++20?
     // expected_lhs(TW) = d_lhs <=> d_rhs;  // Need C++20?
 
-    actual_lhs(PASS_BY_REF)   = static_cast<double>(accept_ref(h_lhs));
-    expected_lhs(PASS_BY_REF) = d_lhs;
+    actual_lhs(PASS_BY_REF) = static_cast<double>(accept_ref(h_lhs));
+
+    // Use accept_ref and accept_ref_expected to ensure the compiler
+    // does not optimize out the casts half_type -> double -> half_type.
+    // Note that these casts are accompanied by rounding. For the bhalf_t
+    // epsilon, the rounding policy used for these casts is enough to cause
+    // the unit tests to fail.
+    // In short, one cannot simply assign static_cast<double>(h_lhs) to
+    // expected_lhs(PASS_BY_REF).
+    expected_lhs(PASS_BY_REF) = accept_ref_expected(h_lhs);
 
     half_tmp = static_cast<float>(h_lhs);
     tmp_ptr  = &(tmp_lhs = half_tmp);
@@ -910,12 +926,7 @@ struct Functor_TestHalfOperators {
 
 template <class half_type>
 void __test_half_operators(half_type h_lhs, half_type h_rhs) {
-  double epsilon = FLT_EPSILON;
-
-  if (std::is_same<half_type, Kokkos::Experimental::half_t>::value)
-    epsilon = FP16_EPSILON;
-  if (std::is_same<half_type, Kokkos::Experimental::bhalf_t>::value)
-    epsilon = BF16_EPSILON;
+  double epsilon = Kokkos::Experimental::epsilon<half_type>::value;
 
   Functor_TestHalfOperators<ViewType, half_type> f_device(h_lhs, h_rhs);
   Functor_TestHalfOperators<ViewTypeHost, half_type> f_host(h_lhs, h_rhs);