NVIDIA · cliffburdick · Jun 4, 2024 · Jun 4, 2024
diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ MatX support is currently limited to **Linux only** due to the time to test Wind
 
 **Note**: CUDA 12.0.0 through 12.2.0 have an issue that causes building MatX unit tests to show a compiler error or cause a segfault in the compiler. Please use CUDA 11.5-11.8 or CUDA 12.2.1+ with MatX.
 
-MatX is using features in C++17 and the latest CUDA compilers and libraries. For this reason, when running with GPU support, CUDA 11.5 and g++9 or clang 17 or newer is required. You can download the CUDA Toolkit [here](https://developer.nvidia.com/cuda-downloads).
+MatX is using features in C++17 and the latest CUDA compilers and libraries. For this reason, when running with GPU support, CUDA 11.5 and g++9, nvc++ 24.5, or clang 17 or newer is required. You can download the CUDA Toolkit [here](https://developer.nvidia.com/cuda-downloads).
 
 MatX has been tested on and supports Pascal, Turing, Volta, Ampere, Ada, and Hopper GPU architectures. Jetson products are supported with Jetpack 5.0 or above.
 

diff --git a/bench/00_operators/operators.cu b/bench/00_operators/operators.cu
@@ -87,7 +87,7 @@ NVBENCH_BENCH_TYPES(random, NVBENCH_TYPE_AXES(random_types));
 template<typename T> T factorial(int N) {
   T prod = 1;
   for(int i=2; i<=N; i++) {
-    prod = prod * i;
+    prod = prod * static_cast<T>(i);
   }
   return prod;
 }
@@ -99,7 +99,7 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
   int l = 5;
   int m = 4;
   int n = 600;
-  ValueType dx = M_PI/n;
+  ValueType dx = static_cast<ValueType>(M_PI/n);
 
   cudaExecutor exec{};
   auto col = range<0>({n+1},ValueType(0), ValueType(dx));
@@ -109,11 +109,11 @@ void sphericalharmonics(nvbench::state &state, nvbench::type_list<ValueType>)
 
   auto Plm = lcollapse<3>(legendre(l, m, cos(theta)));
 
-  ValueType a = (2*l+1)*factorial<ValueType>(l-m);
-  ValueType b = 4*M_PI*factorial<ValueType>(l+m);
+  ValueType a = static_cast<ValueType>(2*l+1)*factorial<ValueType>(l-m);
+  ValueType b = static_cast<ValueType>(4*M_PI)*factorial<ValueType>(l+m);
   ValueType C = cuda::std::sqrt(a/b);
 
-  auto Ylm = C * Plm * exp(cuda::std::complex<ValueType>(0,1)*(m*phi));
+  auto Ylm = C * Plm * exp(cuda::std::complex<ValueType>(0,1)*(static_cast<ValueType>(m)*phi));
   auto [ Xm, Ym, Zm ] = sph2cart(phi, ValueType(M_PI)/2 - theta, abs(real(Ylm)));
 
   // Work around C++17 restriction, structured bindings cannot be captured

diff --git a/bench/00_transform/conv.cu b/bench/00_transform/conv.cu
@@ -130,9 +130,9 @@ void conv2d_direct_batch(nvbench::state &state,
   flops.set_string("description", "Trillions of operations per second");
 
   if constexpr (is_complex_v<ValueType>) {
-    flops.set_float64("value", (double)2 * out.Size(2) * out.Size(1) * out.Size(0) * bt.Size(2) * bt.Size(1) * 4 / seconds / 1e12);
+    flops.set_float64("value", static_cast<double>(2 * out.Size(2) * out.Size(1) * out.Size(0) * bt.Size(2) * bt.Size(1) * 4) / seconds / 1e12);
   } else {
-    flops.set_float64("value", (double)2 * out.Size(2) * out.Size(1) * out.Size(0) * bt.Size(2) * bt.Size(1) / seconds / 1e12);
+    flops.set_float64("value", static_cast<double>(2 * out.Size(2) * out.Size(1) * out.Size(0) * bt.Size(2) * bt.Size(1)) / seconds / 1e12);
   }
 }
 NVBENCH_BENCH_TYPES(conv2d_direct_batch, NVBENCH_TYPE_AXES(conv_types));
diff --git a/bench/00_transform/qr.cu b/bench/00_transform/qr.cu
@@ -19,9 +19,9 @@ void qr_batch(nvbench::state &state,
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
   cudaExecutor exec{stream};
 
-  int batch = state.get_int64("batch");
-  int m = state.get_int64("rows");
-  int n = state.get_int64("cols");
+  int64_t batch = state.get_int64("batch");
+  int64_t m = state.get_int64("rows");
+  int64_t n = state.get_int64("cols");
 
   auto A = make_tensor<AType>({batch, m, n});
   auto Q = make_tensor<AType>({batch, m, m});

diff --git a/bench/00_transform/svd_power.cu b/bench/00_transform/svd_power.cu
@@ -19,11 +19,11 @@ void svdpi_batch(nvbench::state &state,
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
   cudaExecutor exec{stream};
 
-  int batch = state.get_int64("batch");
-  int m = state.get_int64("rows");
-  int n = state.get_int64("cols");
+  int64_t batch = state.get_int64("batch");
+  int64_t m = state.get_int64("rows");
+  int64_t n = state.get_int64("cols");
 
-  int r = std::min(n,m);
+  int64_t r = std::min(n,m);
   auto A = make_tensor<AType>({batch, m, n});
   auto U = make_tensor<AType>({batch, m, r});
   auto VT = make_tensor<AType>({batch, r, n});
@@ -68,11 +68,11 @@ void svdbpi_batch(nvbench::state &state,
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream));
   cudaExecutor exec{stream};
 
-  int batch = state.get_int64("batch");
-  int m = state.get_int64("rows");
-  int n = state.get_int64("cols");
+  int64_t batch = state.get_int64("batch");
+  int64_t m = state.get_int64("rows");
+  int64_t n = state.get_int64("cols");
 
-  int r = std::min(n,m);
+  int64_t r = std::min(n,m);
   auto A = make_tensor<AType>({batch, m, n});
   auto U = make_tensor<AType>({batch, m, r});
   auto VT = make_tensor<AType>({batch, r, n});

diff --git a/docs_input/build.rst b/docs_input/build.rst
@@ -19,7 +19,7 @@ the CPM_ documentation or the documentation for each package for more informatio
 
 System Requirements
 -------------------
-MatX requires **CUDA 11.5** or higher, and **g++ 9.3+** or **clang 17+** for the host compiler. See the CUDA toolkit documentation
+MatX requires **CUDA 11.5** or higher, and **g++ 9.3+**, **clang 17+**, or **nvc++ 24.5+** for the host compiler. See the CUDA toolkit documentation
 for supported host compilers. Other requirements for optional components are listed below.
 
 .. warning:: Using MatX with an unsupported compiler may result in compiler and/or runtime errors.

diff --git a/include/matx/generators/range.h b/include/matx/generators/range.h
@@ -60,13 +60,6 @@ namespace matx
           else {
             return first_ + T(static_cast<T>(idx) * step_);
           }
-
-          if constexpr (!is_matx_half_v<T>) {
-            return first_ + T(static_cast<T>(idx) * step_);
-          }
-          else {
-            return first_ + T(static_cast<T>((float)idx) * step_);
-          }
         }
     };
   }

diff --git a/include/matx/operators/repmat.h b/include/matx/operators/repmat.h
@@ -105,15 +105,6 @@ namespace matx
             UpdateIndex(tup);
             return cuda::std::apply(op_, tup);
           }
-
-          if constexpr (Rank() != 0) {
-            auto tup = cuda::std::make_tuple(indices...);
-            UpdateIndex(tup);
-            return cuda::std::apply(op_, tup);
-          }
-          else {
-            return op_();
-          }
         }
 
         template <typename... Is>
@@ -127,15 +118,6 @@ namespace matx
             UpdateIndex(tup);
             return cuda::std::apply(op_, tup);
           }
-
-          if constexpr (Rank() != 0) {
-            auto tup = cuda::std::make_tuple(indices...);
-            UpdateIndex(tup);
-            return cuda::std::apply(op_, tup);
-          }
-          else {
-            return op_();
-          }
         }
 
         template <typename ShapeType, typename Executor>

diff --git a/include/matx/operators/reverse.h b/include/matx/operators/reverse.h
@@ -74,15 +74,6 @@ namespace matx
             cuda::std::get<DIM>(tup) = Size(DIM) - cuda::std::get<DIM>(tup) - 1;
             return cuda::std::apply(op_, tup);
           }
-
-          if constexpr (Rank() != 0) {
-            auto tup = cuda::std::make_tuple(indices...);
-            cuda::std::get<DIM>(tup) = Size(DIM) - cuda::std::get<DIM>(tup) - 1;
-            return cuda::std::apply(op_, tup);
-          } 
-          else {
-            return op_();
-          }
         }
 
         template <typename... Is>
@@ -96,15 +87,6 @@ namespace matx
             cuda::std::get<DIM>(tup) = Size(DIM) - cuda::std::get<DIM>(tup) - 1;
             return cuda::std::apply(op_, tup);
           }
-
-          if constexpr (Rank() != 0) {
-            auto tup = cuda::std::make_tuple(indices...);
-            cuda::std::get<DIM>(tup) = Size(DIM) - cuda::std::get<DIM>(tup) - 1;
-            return cuda::std::apply(op_, tup);
-          } 
-          else {
-            return op_();
-          }
         }
 
         static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()

diff --git a/include/matx/operators/scalar_ops.h b/include/matx/operators/scalar_ops.h
@@ -496,10 +496,7 @@ template <typename T1, typename T2> struct FModF {
   static std::string str(const std::string &str1, const std::string &str2) { return "(" + str1 + "%" + str2 + ")"; }
 
   static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto op(T1 v1, T2 v2) { 
-    return _internal_fmod(v1, v2); 
-
-    // Unreachable, but required by the compiler
-    return typename std::invoke_result_t<decltype(op), T1, T2>{0};    
+    return _internal_fmod(v1, v2);  
   }
 };
 template <typename T1, typename T2> using FModOp = BinOp<T1, T2, FModF<T1, T2>>;
@@ -520,10 +517,7 @@ template <typename T1, typename T2> struct Atan2F {
   static std::string str(const std::string &str1, const std::string &str2) { return "(" + str1 + "%" + str2 + ")"; }
 
   static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto op(T1 v1, T2 v2) { 
-    return _internal_atan2(v1, v2); 
-
-    // Unreachable, but required by the compiler
-    return typename std::invoke_result_t<decltype(op), T1, T2>{0};    
+    return _internal_atan2(v1, v2);  
   }
 };
 template <typename T1, typename T2> using Atan2Op = BinOp<T1, T2, Atan2F<T1, T2>>;
@@ -649,8 +643,6 @@ static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto _internal_isnan(T v1)
   } else {
     return cuda::std::isnan(static_cast<castType>(v1));
   }
-
-  return false;  
 }
 template <typename T>
 struct IsNan {
@@ -675,9 +667,7 @@ static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto _internal_isinf(T v1)
     return cuda::std::isinf(static_cast<typename castType::value_type>(v1.real())) || cuda::std::isinf(static_cast<typename castType::value_type>(v1.imag()));
   } else {
     return cuda::std::isinf(static_cast<castType>(v1));
-  }
-
-  return false;  
+  } 
 }
 template <typename T>
 struct IsInf {