diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier index a2b0616d4c..69a8ecbbfb 100644 --- a/include/cuda/std/barrier +++ b/include/cuda/std/barrier @@ -306,7 +306,7 @@ template<> struct __memcpy_async_impl<4, false> { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) { - asm volatile ("cp.async.ca.shared.global [%0], [%1], 4;" + asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;" :: "r"(static_cast(__cvta_generic_to_shared(__destination + __offset))), "l"(__source + __offset) : "memory"); @@ -319,7 +319,7 @@ template<> struct __memcpy_async_impl<8, false> { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) { - asm volatile ("cp.async.ca.shared.global [%0], [%1], 8;" + asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;" :: "r"(static_cast(__cvta_generic_to_shared(__destination + __offset))), "l"(__source + __offset) : "memory"); @@ -332,7 +332,7 @@ template<> struct __memcpy_async_impl<16, false> { __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) { for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) { - asm volatile ("cp.async.ca.shared.global [%0], [%1], 16;" + asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;" :: "r"(static_cast(__cvta_generic_to_shared(__destination + __offset))), "l"(__source + __offset) : "memory");