NVIDIA · wmaxey · Nov 5, 2021 · Oct 28, 2021 · Oct 28, 2021
diff --git a/include/cuda/std/barrier b/include/cuda/std/barrier
@@ -306,7 +306,7 @@ template<>
 struct __memcpy_async_impl<4, false> {
     __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
         for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) {
-            asm volatile ("cp.async.ca.shared.global [%0], [%1], 4;"
+            asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;"
                 :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                     "l"(__source + __offset)
                 : "memory");
@@ -319,7 +319,7 @@ template<>
 struct __memcpy_async_impl<8, false> {
     __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
         for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) {
-            asm volatile ("cp.async.ca.shared.global [%0], [%1], 8;"
+            asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;"
                 :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                     "l"(__source + __offset)
                 : "memory");
@@ -332,7 +332,7 @@ template<>
 struct __memcpy_async_impl<16, false> {
     __device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
         for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) {
-            asm volatile ("cp.async.ca.shared.global [%0], [%1], 16;"
+            asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;"
                 :: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
                     "l"(__source + __offset)
                 : "memory");