Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

memcpy_async should cache only in L2 when possible #220

Merged
merged 2 commits into from
Nov 5, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions include/cuda/std/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ template<>
struct __memcpy_async_impl<4, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 4; __offset < __total_size; __offset += __stride * 4) {
- asm volatile ("cp.async.ca.shared.global [%0], [%1], 4;"
+ asm volatile ("cp.async.ca.shared.global [%0], [%1], 4, 4;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
Expand All @@ -319,7 +319,7 @@ template<>
struct __memcpy_async_impl<8, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 8; __offset < __total_size; __offset += __stride * 8) {
- asm volatile ("cp.async.ca.shared.global [%0], [%1], 8;"
+ asm volatile ("cp.async.ca.shared.global [%0], [%1], 8, 8;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
Expand All @@ -332,7 +332,7 @@ template<>
struct __memcpy_async_impl<16, false> {
__device__ static inline bool __copy(char * __destination, char const * __source, std::size_t __total_size, std::size_t __rank, std::size_t __stride) {
for (std::size_t __offset = __rank * 16; __offset < __total_size; __offset += __stride * 16) {
- asm volatile ("cp.async.ca.shared.global [%0], [%1], 16;"
+ asm volatile ("cp.async.cg.shared.global [%0], [%1], 16, 16;"
:: "r"(static_cast<std::uint32_t>(__cvta_generic_to_shared(__destination + __offset))),
"l"(__source + __offset)
: "memory");
Expand Down