Skip to content

Commit

Permalink
Fix atomic performance regressions by avoiding use of memcpy with nat…
Browse files Browse the repository at this point in the history
…ively supported atomic types. (#1801)

* Add atomic derived overloads that capture non-scalar types and converts them to u32/64.

* Update codegen to use native types for .cas and .exch.

* Use `_CCCL_TRAIT` for SFINAE.

* Fix _CCCL_TRAIT usage

* Maybe safe everything

* [pre-commit.ci] auto code formatting

* Improve codegen tests for atomic.
* Switch to PTX.
* Tightly check all output.

* Remove casts of atomic pointers used in PTX.

* [pre-commit.ci] auto code formatting

---------

Co-authored-by: Michael Schellenberger Costa <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Jun 8, 2024
1 parent 998e020 commit e734d68
Show file tree
Hide file tree
Showing 11 changed files with 2,129 additions and 1,717 deletions.
128 changes: 56 additions & 72 deletions libcudacxx/codegen/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,15 @@ int main()
{"volatile", ""}};
std::vector<std::string> rmw_classes{"bitwise", "arithmetic"};
std::map<std::string, std::map<std::string, std::string>> rmw_operations{
{"bitwise",
{"bitwise", std::map<std::string, std::string>{{"fetch_and", ".and"}, {"fetch_or", ".or"}, {"fetch_xor", ".xor"}}},
{"arithmetic",
std::map<std::string, std::string>{
{"exchange", ".exch"},
{"compare_exchange", ".cas"},
{"fetch_and", ".and"},
{"fetch_or", ".or"},
{"fetch_xor", ".xor"}}},
{"arithmetic",
std::map<std::string, std::string>{
{"fetch_add", ".add"}, {"fetch_sub", ".add"}, {"fetch_max", ".max"}, {"fetch_min", ".min"}}}};
{"fetch_add", ".add"},
{"fetch_sub", ".add"},
{"fetch_max", ".max"},
{"fetch_min", ".min"}}}};
std::map<std::string, std::map<std::string, std::string>> rmw_types{
{"bitwise", std::map<std::string, std::string>{{"", ".b"}}},
{"arithmetic", std::map<std::string, std::string>{{"u", ".u"}, {"s", ".s"}, {"f", ".f"}}}};
Expand Down Expand Up @@ -273,7 +272,9 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
{
continue;
}
if (type.first == "s" && (rmw.first == "fetch_add" || rmw.first == "fetch_sub"))
if (type.first == "s"
&& (rmw.first == "fetch_add" || rmw.first == "fetch_sub" || rmw.first == "compare_exchange"
|| rmw.first == "exchange"))
{
continue;
}
Expand Down Expand Up @@ -302,13 +303,19 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
{
out << "__op = -__op;" << std::endl;
}
out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << type.second << sz << " ";
if (rmw.first == "compare_exchange")
{
out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << ".b" << sz << " ";
out << "%0,[%1],%2,%3";
}
else if (rmw.first == "exchange")
{
out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << ".b" << sz << " ";
out << "%0,[%1],%2";
}
else
{
out << "asm volatile(\"atom" << rmw.second << sem.second << s.second << type.second << sz << " ";
out << "%0,[%1],%2";
}
out << ";\" : ";
Expand All @@ -326,30 +333,52 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
}
for (auto& cv : cv_qualifier)
{
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz / 8;
if (type.first == "f")
{
out << " && _CCCL_TRAIT(is_floating_point, _Type), int> = 0>\n";
}
else if (rmw.first == "fetch_max" || rmw.first == "fetch_min")
{
if (type.first == "u")
{
out << " && _CCCL_TRAIT(is_integral, _Type) && _CCCL_TRAIT(is_unsigned, _Type), int> "
"= 0>\n";
}
else if (type.first == "s")
{
out << " && _CCCL_TRAIT(is_integral, _Type) && _CCCL_TRAIT(is_signed, _Type), int> = "
"0>\n";
}
}
else if (type.first == "u")
{
out << " && (_CCCL_TRAIT(is_integral, _Type) || _CCCL_TRAIT(is_pointer, _Type)), int> = 0>\n";
}
else
{
out << ", int> = 0>\n";
}
if (rmw.first == "compare_exchange")
{
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz / 8 << ", int> = 0>\n";
out << "_CCCL_DEVICE bool __atomic_compare_exchange_cuda(" << cv
<< "_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int "
<< "void *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int "
"__failure_memorder, "
<< scopenametag(s.first) << ") {\n";
out << " uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n";
out << " memcpy(&__tmp, &__desired, " << sz / 8 << ");\n";
out << " memcpy(&__old, __expected, " << sz / 8 << ");\n";
out << " __old_tmp = __old;\n";
out << " auto __old = *__expected;\n";
out << " NV_DISPATCH_TARGET(\n";
out << " NV_PROVIDES_SM_70, (\n";
out << " switch (__stronger_order_cuda(__success_memorder, __failure_memorder)) {\n";
out << " case __ATOMIC_SEQ_CST: " << fencename("sc"s, s.first) << "(); _CCCL_FALLTHROUGH();\n";
out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n";
out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_acquire_" << type.first << sz << "_"
<< s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n";
<< s.first << "(__ptr, *__expected, __old, __desired); break;\n";
out << " case __ATOMIC_ACQ_REL: __cuda_compare_exchange_acq_rel_" << type.first << sz << "_"
<< s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n";
<< s.first << "(__ptr, *__expected, __old, __desired); break;\n";
out << " case __ATOMIC_RELEASE: __cuda_compare_exchange_release_" << type.first << sz << "_"
<< s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n";
<< s.first << "(__ptr, *__expected, __old, __desired); break;\n";
out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_relaxed_" << type.first << sz << "_"
<< s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n";
<< s.first << "(__ptr, *__expected, __old, __desired); break;\n";
out << " default: assert(0);\n";
out << " }\n";
out << " ),\n";
Expand All @@ -359,77 +388,33 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
out << " case __ATOMIC_ACQ_REL: __cuda_membar_" << s.first << "(); _CCCL_FALLTHROUGH();\n";
out << " case __ATOMIC_CONSUME: _CCCL_FALLTHROUGH();\n";
out << " case __ATOMIC_ACQUIRE: __cuda_compare_exchange_volatile_" << type.first << sz << "_"
<< s.first << "(__ptr, __old, __old_tmp, __tmp); __cuda_membar_" << s.first << "(); break;\n";
<< s.first << "(__ptr, *__expected, __old, __desired); __cuda_membar_" << s.first << "(); break;\n";
out << " case __ATOMIC_RELEASE: __cuda_membar_" << s.first
<< "(); __cuda_compare_exchange_volatile_" << type.first << sz << "_" << s.first
<< "(__ptr, __old, __old_tmp, __tmp); break;\n";
<< "(__ptr, *__expected, __old, __desired); break;\n";
out << " case __ATOMIC_RELAXED: __cuda_compare_exchange_volatile_" << type.first << sz << "_"
<< s.first << "(__ptr, __old, __old_tmp, __tmp); break;\n";
<< s.first << "(__ptr, *__expected, __old, __desired); break;\n";
out << " default: assert(0);\n";
out << " }\n";
out << " )\n";
out << " )\n";
out << " bool const __ret = __old == __old_tmp;\n";
out << " memcpy(__expected, &__old, " << sz / 8 << ");\n";
out << " return __ret;\n";
out << " return (__old == *__expected);\n";
out << "}\n";
}
else
{
out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz / 8;
if (rmw.first == "exchange")
{
out << ", int> = 0>\n";
out
<< "_CCCL_DEVICE void __atomic_exchange_cuda(" << cv
<< "_Type *__ptr, _Type *__val, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " uint" << sz << "_t __tmp = 0;\n";
out << " memcpy(&__tmp, __val, " << sz / 8 << ");\n";
<< "void *__ptr, _Type *__val, _Type *__ret, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " _Type __tmp = *__val;\n";
}
else
{
if (type.first == "f")
{
out << " && _CUDA_VSTD::is_floating_point<_Type>::value, int> = 0>\n";
}
else if (rmw.first == "fetch_max" || rmw.first == "fetch_min")
{
if (type.first == "u")
{
out << " && _CUDA_VSTD::is_integral<_Type>::value && _CUDA_VSTD::is_unsigned<_Type>::value, int> "
"= 0>\n";
}
else if (type.first == "s")
{
out << " && _CUDA_VSTD::is_integral<_Type>::value && _CUDA_VSTD::is_signed<_Type>::value, int> = "
"0>\n";
}
}
else if (type.first == "u")
{
out << " && _CUDA_VSTD::is_integral<_Type>::value, int> = 0>\n";
}
else
{
out << ", int> = 0>\n";
}
out << "_CCCL_DEVICE _Type __atomic_" << rmw.first << "_cuda(" << cv
<< "_Type *__ptr, _Type __val, int __memorder, " << scopenametag(s.first) << ") {\n";
out << " _Type __ret;\n";
if (type.first == "f" && sz == 32)
{
out << " float";
}
else if (type.first == "f" && sz == 64)
{
out << " double";
}
else
{
out << " uint" << sz << "_t";
}
out << " __tmp = 0;\n";
out << " memcpy(&__tmp, &__val, " << sz / 8 << ");\n";
out << " _Type __tmp = __val;\n";
}
out << " NV_DISPATCH_TARGET(\n";
out << " NV_PROVIDES_SM_70, (\n";
Expand Down Expand Up @@ -468,8 +453,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
}
else
{
out << " memcpy(&__ret, &__tmp, " << sz / 8 << ");\n";
out << " return __ret;\n";
out << " return __tmp;\n";
}
out << "}\n";
}
Expand Down
66 changes: 66 additions & 0 deletions libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_derived.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,78 @@
#endif // no system header

#include <cuda/std/__atomic/functions/cuda_ptx_generated.h>
#include <cuda/std/__type_traits/conditional.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_scalar.h>
#include <cuda/std/cstdint>

_LIBCUDACXX_BEGIN_NAMESPACE_STD

#if defined(_CCCL_CUDA_COMPILER)

template <typename _Tp,
typename _Sco,
__enable_if_t<!is_scalar<_Tp>::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0>
_CCCL_DEVICE bool __atomic_compare_exchange_cuda(
void volatile* __ptr,
_Tp* __expected,
const _Tp __desired,
bool __weak,
int __success_memorder,
int __failure_memorder,
_Sco)
{
using __proxy_t = _If<sizeof(_Tp) == 4, uint32_t, uint64_t>;
__proxy_t __old = 0;
__proxy_t __new = 0;
memcpy(&__old, __expected, sizeof(__proxy_t));
memcpy(&__new, &__desired, sizeof(__proxy_t));
bool __result =
__atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{});
memcpy(__expected, &__old, sizeof(__proxy_t));
return __result;
}
template <typename _Tp,
typename _Sco,
__enable_if_t<!is_scalar<_Tp>::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0>
_CCCL_DEVICE bool __atomic_compare_exchange_cuda(
void* __ptr, _Tp* __expected, const _Tp __desired, bool __weak, int __success_memorder, int __failure_memorder, _Sco)
{
using __proxy_t = _If<sizeof(_Tp) == 4, uint32_t, uint64_t>;
__proxy_t __old = 0;
__proxy_t __new = 0;
memcpy(&__old, __expected, sizeof(__proxy_t));
memcpy(&__new, &__desired, sizeof(__proxy_t));
bool __result =
__atomic_compare_exchange_cuda(__ptr, &__old, __new, __weak, __success_memorder, __failure_memorder, _Sco{});
memcpy(__expected, &__old, sizeof(__proxy_t));
return __result;
}
template <typename _Tp,
typename _Sco,
__enable_if_t<!is_scalar<_Tp>::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0>
_CCCL_DEVICE void __atomic_exchange_cuda(void volatile* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco)
{
using __proxy_t = _If<sizeof(_Tp) == 4, uint32_t, uint64_t>;
__proxy_t __old = 0;
__proxy_t __new = 0;
memcpy(&__new, __val, sizeof(__proxy_t));
__atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{});
memcpy(__ret, &__old, sizeof(__proxy_t));
}
template <typename _Tp,
typename _Sco,
__enable_if_t<!is_scalar<_Tp>::value && (sizeof(_Tp) == 4 || sizeof(_Tp) == 8), int> = 0>
_CCCL_DEVICE void __atomic_exchange_cuda(void* __ptr, _Tp* __val, _Tp* __ret, int __memorder, _Sco)
{
using __proxy_t = _If<sizeof(_Tp) == 4, uint32_t, uint64_t>;
__proxy_t __old = 0;
__proxy_t __new = 0;
memcpy(&__new, __val, sizeof(__proxy_t));
__atomic_exchange_cuda(__ptr, &__new, &__old, __memorder, _Sco{});
memcpy(__ret, &__old, sizeof(__proxy_t));
}

template <typename _Tp, typename _Sco, __enable_if_t<sizeof(_Tp) <= 2, int> = 0>
_CCCL_DEVICE bool __atomic_compare_exchange_cuda(
_Tp volatile* __ptr, _Tp* __expected, const _Tp __desired, bool, int __success_memorder, int __failure_memorder, _Sco)
Expand Down
Loading

0 comments on commit e734d68

Please sign in to comment.