Skip to content

Commit

Permalink
Merge pull request #2403 from ROCm/6.1_build_fix
Browse files Browse the repository at this point in the history
[ROCm] Don't use __floats2half2_rn in CPU code
  • Loading branch information
draganmladjenovic authored Feb 20, 2024
2 parents b48a88a + c7c19c5 commit e2b1dd1
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
9 changes: 6 additions & 3 deletions tensorflow/core/kernels/dropout_op_gpu.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -189,18 +189,21 @@ void ApplyDropout<GPUDevice, T>::operator()(const GPUDevice& d, T* out, uint8* m
uint64 num_blocks = (num_groups + kThreadInBlock - 1) / kThreadInBlock;
// for FP32, it's optimal to run at 256x256
if (std::is_same<T, float>::value && num_blocks > 256) num_blocks = 256;

// NOTE: A convoluted half2 initialization is done this way to circumvent implicit
// float to half conversion.
if (do_half2) {
TF_CHECK_OK(GpuLaunchKernel(
RNGAndApplyDropoutKernel<half2, half2, half2>, num_blocks,
kThreadInBlock, 0, d.stream(), gen, num_elements,
reinterpret_cast<half2*>(out), mask, reinterpret_cast<const half2*>(in),
- __floats2half2_rn(rate, rate), __floats2half2_rn(scale, scale)));
+ half2{half(Eigen::half(rate)), half(Eigen::half(rate))},
+ half2{half(Eigen::half(scale)), half(Eigen::half(scale))}));
} else {
TF_CHECK_OK(GpuLaunchKernel(
RNGAndApplyDropoutKernel<T, half2, float2>, num_blocks, kThreadInBlock,
0, d.stream(), gen, num_elements, out, mask, in,
- __floats2half2_rn(rate, rate), make_float2(scale, scale)));
+ half2{half(Eigen::half(rate)), half(Eigen::half(rate))},
+ make_float2(scale, scale)));
}
}

Expand Down
4 changes: 2 additions & 2 deletions tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ if [ -f /usertools/rocm.bazelrc ]; then
--config=sigbuild_local_cache \
--config=rocm \
--config=pycpp \
- --action_env=OPENBLAS_CORETYPE=Haswell \
+ --action_env=OPENBLAS_CORETYPE=Haswell \
--action_env=TF_PYTHON_VERSION=$PYTHON_VERSION \
--action_env=TF_ENABLE_ONEDNN_OPTS=0 \
--test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
Expand All @@ -86,7 +86,7 @@ else
--test_env=HSA_TOOLS_LIB=libroctracer64.so \
--test_env=TF_PYTHON_VERSION=$PYTHON_VERSION \
--action_env=OPENBLAS_CORETYPE=Haswell \
- --action_env=TF_ENABLE_ONEDNN_OPTS=0 \
+ --action_env=TF_ENABLE_ONEDNN_OPTS=0 \
--test_timeout 920,2400,7200,9600 \
--build_tests_only \
--test_output=errors \
Expand Down

0 comments on commit e2b1dd1

Please sign in to comment.