Fix ordering of alignas and __shared__ (#1601)
alignas must precede `__shared__`

Fixes #1599: Misplaced `alignas()` in cuda/memcpy_async/memcpy_async_tx.pass.cpp
miscco authored Apr 8, 2024
1 parent 14a2984 commit eb814a7
Showing 6 changed files with 10 additions and 10 deletions.
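
For reference, a minimal sketch of the corrected declaration order described by this commit: the `alignas` specifier is written first, followed by the `__shared__` memory-space qualifier. This example is not taken from the changed files; the kernel name, array name, and parameters are illustrative only.

```cuda
// Illustrative kernel (not from this commit): the alignment specifier
// precedes the __shared__ qualifier, matching the ordering fixed here.
__global__ void aligned_smem_example(const int* gmem_in, int* gmem_out)
{
  // 16-byte-aligned, statically sized shared-memory buffer.
  alignas(16) __shared__ int smem_buf[1024];

  // Stage data through shared memory; guard against blocks smaller than the buffer.
  if (threadIdx.x < 1024) {
    smem_buf[threadIdx.x] = gmem_in[threadIdx.x];
  }
  __syncthreads();

  if (threadIdx.x < 1024) {
    gmem_out[threadIdx.x] = smem_buf[threadIdx.x] + 1;
  }
}
```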
@@ -32,7 +32,7 @@ Copies `size` bytes from global memory `src` to shared memory `dest` and decreme
## Requires

* `is_trivially_copyable_v<T>` is true.

## Notes

This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
@@ -60,7 +60,7 @@ static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy
__device__ alignas(16) int gmem_x[2048];
__global__ void example_kernel() {
-__shared__ alignas(16) int smem_x[1024];
+alignas(16) __shared__ int smem_x[1024];
__shared__ cuda::barrier<cuda::thread_scope_block> bar;
if (threadIdx.x == 0) {
init(&bar, blockDim.x);
@@ -73,7 +73,7 @@ __global__ void example_kernel() {
token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
} else {
token = bar.arrive(1);
-}
+}
bar.wait(cuda::std::move(token));
// smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
@@ -45,7 +45,7 @@ __device__ alignas(16) int gmem_x[2048];
__global__ void example_kernel() {
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
-__shared__ alignas(16) int smem_x[1024];
+alignas(16) __shared__ int smem_x[1024];
__shared__ barrier_t bar;
if (threadIdx.x == 0) {
@@ -56,9 +56,9 @@ __global__ void example_kernel() {
if (threadIdx.x == 0) {
cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
cuda::device::barrier_expect_tx(bar, sizeof(smem_x));
-}
+}
auto token = bar.arrive(1);
bar.wait(cuda::std::move(token));
// smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
@@ -39,7 +39,7 @@ __device__ void test()
__syncthreads();

// TEST: Add i to buffer[i]
-__shared__ alignas(16) int smem_buffer[buf_len];
+alignas(16) __shared__ int smem_buffer[buf_len];
__shared__ barrier bar;
if (threadIdx.x == 0) { init(&bar, blockDim.x); }
__syncthreads();
@@ -64,7 +64,7 @@ __device__ void test(int base_i, int base_j)
__syncthreads();

// TEST: Add i to buffer[i]
-__shared__ alignas(128) int smem_buffer[buf_len];
+alignas(128) __shared__ int smem_buffer[buf_len];
__shared__ barrier bar;
if (threadIdx.x == 0) { init(&bar, blockDim.x); }
__syncthreads();
@@ -149,7 +149,7 @@ __device__ void test(cuda::std::array<uint32_t, num_dims> smem_coord,
__syncthreads();

// TEST: Add i to buffer[i]
-__shared__ alignas(128) int smem_buffer[smem_len];
+alignas(128) __shared__ int smem_buffer[smem_len];
__shared__ barrier bar;
if (threadIdx.x == 0) { init(&bar, blockDim.x); }
__syncthreads();
@@ -39,7 +39,7 @@ int main(int, char**)
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
-__shared__ alignas(16) int smem_x[1024];
+alignas(16) __shared__ int smem_x[1024];

shared_memory_selector<barrier_t, constructor_initializer> sel;
barrier_t* b = sel.construct(blockDim.x);