Merge pull request #2403 from ROCm/6.1_build_fix

[ROCm] Don't use __floats2half2_rn in CPU code
ROCm · Feb 20, 2024 · e2b1dd1
2 parents b48a88a + c7c19c5
commit e2b1dd1
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 5 deletions.
diff --git a/tensorflow/core/kernels/dropout_op_gpu.cu.cc b/tensorflow/core/kernels/dropout_op_gpu.cu.cc
@@ -189,18 +189,21 @@ void ApplyDropout<GPUDevice, T>::operator()(const GPUDevice& d, T* out, uint8* m
   uint64 num_blocks = (num_groups + kThreadInBlock - 1) / kThreadInBlock;
   // for FP32, it's optimal to run at 256x256
   if (std::is_same<T, float>::value && num_blocks > 256) num_blocks = 256;
-
+  // NOTE: Each half2 is built explicitly through Eigen::half rather than with
+  // __floats2half2_rn, so that no implicit float-to-half conversion is needed.
   if (do_half2) {
     TF_CHECK_OK(GpuLaunchKernel(
         RNGAndApplyDropoutKernel<half2, half2, half2>, num_blocks,
         kThreadInBlock, 0, d.stream(), gen, num_elements,
         reinterpret_cast<half2*>(out), mask, reinterpret_cast<const half2*>(in),
-        __floats2half2_rn(rate, rate), __floats2half2_rn(scale, scale)));
+        half2{half(Eigen::half(rate)), half(Eigen::half(rate))},
+        half2{half(Eigen::half(scale)), half(Eigen::half(scale))}));
   } else {
     TF_CHECK_OK(GpuLaunchKernel(
         RNGAndApplyDropoutKernel<T, half2, float2>, num_blocks, kThreadInBlock,
         0, d.stream(), gen, num_elements, out, mask, in,
-        __floats2half2_rn(rate, rate), make_float2(scale, scale)));
+        half2{half(Eigen::half(rate)), half(Eigen::half(rate))},
+        make_float2(scale, scale)));
   }
 }
 

diff --git a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
@@ -63,7 +63,7 @@ if [ -f /usertools/rocm.bazelrc ]; then
              --config=sigbuild_local_cache \
              --config=rocm \
              --config=pycpp \
-	     --action_env=OPENBLAS_CORETYPE=Haswell \
+             --action_env=OPENBLAS_CORETYPE=Haswell \
              --action_env=TF_PYTHON_VERSION=$PYTHON_VERSION \
              --action_env=TF_ENABLE_ONEDNN_OPTS=0 \
              --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
@@ -86,7 +86,7 @@ else
 	      --test_env=HSA_TOOLS_LIB=libroctracer64.so \
 	      --test_env=TF_PYTHON_VERSION=$PYTHON_VERSION \
 	      --action_env=OPENBLAS_CORETYPE=Haswell \
-              --action_env=TF_ENABLE_ONEDNN_OPTS=0 \
+        --action_env=TF_ENABLE_ONEDNN_OPTS=0 \
 	      --test_timeout 920,2400,7200,9600 \
 	      --build_tests_only \
 	      --test_output=errors \