Fix ordering of alignas and __shared__ (#1601)
alignas must precede `__shared__`

Fixes #1599: Misplaced `alignas()` in cuda/memcpy_async/memcpy_async_tx.pass.cpp
miscco authored Apr 8, 2024
1 parent 14a2984 commit eb814a7
Showing 6 changed files with 10 additions and 10 deletions.
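
For reference, a minimal sketch of the corrected declaration order described by this commit: the `alignas` specifier is written first, followed by the `__shared__` memory-space qualifier. This example is not taken from the changed files; the kernel name, array name, and parameters are illustrative only.

```cuda
// Illustrative kernel (not from this commit): the alignment specifier
// precedes the __shared__ qualifier, matching the ordering fixed here.
__global__ void aligned_smem_example(const int* gmem_in, int* gmem_out)
{
  // 16-byte-aligned, statically sized shared-memory buffer.
  alignas(16) __shared__ int smem_buf[1024];

  // Stage data through shared memory; guard against blocks smaller than the buffer.
  if (threadIdx.x < 1024) {
    smem_buf[threadIdx.x] = gmem_in[threadIdx.x];
  }
  __syncthreads();

  if (threadIdx.x < 1024) {
    gmem_out[threadIdx.x] = smem_buf[threadIdx.x] + 1;
  }
}
```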
@@ -32,7 +32,7 @@ Copies `size` bytes from global memory `src` to shared memory `dest` and decreme
## Requires

* `is_trivially_copyable_v<T>` is true.

## Notes

This function can only be used under CUDA Compute Capability 9.0 (Hopper) or
@@ -60,7 +60,7 @@ static_assert(false, "Insufficient CUDA Compute Capability: cuda::device::memcpy
__device__ alignas(16) int gmem_x[2048];
__global__ void example_kernel() {
-__shared__ alignas(16) int smem_x[1024];
+alignas(16) __shared__ int smem_x[1024];
__shared__ cuda::barrier<cuda::thread_scope_block> bar;
if (threadIdx.x == 0) {
init(&bar, blockDim.x);
@@ -73,7 +73,7 @@ __global__ void example_kernel() {
token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
} else {
token = bar.arrive(1);
-}
+}
bar.wait(cuda::std::move(token));
// smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
@@ -45,7 +45,7 @@ __device__ alignas(16) int gmem_x[2048];
__global__ void example_kernel() {
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
-__shared__ alignas(16) int smem_x[1024];
+alignas(16) __shared__ int smem_x[1024];
__shared__ barrier_t bar;
if (threadIdx.x == 0) {
@@ -56,9 +56,9 @@ __global__ void example_kernel() {
if (threadIdx.x == 0) {
cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
cuda::device::barrier_expect_tx(bar, sizeof(smem_x));
-}
+}
auto token = bar.arrive(1);
bar.wait(cuda::std::move(token));
// smem_x contains the contents of gmem_x[0], ..., gmem_x[1023]
@@ -39,7 +39,7 @@ __device__ void test()
__syncthreads();

// TEST: Add i to buffer[i]
-__shared__ alignas(16) int smem_buffer[buf_len];
+alignas(16) __shared__ int smem_buffer[buf_len];
__shared__ barrier bar;
if (threadIdx.x == 0) { init(&bar, blockDim.x); }
__syncthreads();
@@ -64,7 +64,7 @@ __device__ void test(int base_i, int base_j)
__syncthreads();

// TEST: Add i to buffer[i]
-__shared__ alignas(128) int smem_buffer[buf_len];
+alignas(128) __shared__ int smem_buffer[buf_len];
__shared__ barrier bar;
if (threadIdx.x == 0) { init(&bar, blockDim.x); }
__syncthreads();
@@ -149,7 +149,7 @@ __device__ void test(cuda::std::array<uint32_t, num_dims> smem_coord,
__syncthreads();

// TEST: Add i to buffer[i]
-__shared__ alignas(128) int smem_buffer[smem_len];
+alignas(128) __shared__ int smem_buffer[smem_len];
__shared__ barrier bar;
if (threadIdx.x == 0) { init(&bar, blockDim.x); }
__syncthreads();
@@ -39,7 +39,7 @@ int main(int, char**)
NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
using barrier_t = cuda::barrier<cuda::thread_scope_block>;
-__shared__ alignas(16) int smem_x[1024];
+alignas(16) __shared__ int smem_x[1024];

shared_memory_selector<barrier_t, constructor_initializer> sel;
barrier_t* b = sel.construct(blockDim.x);