Skip to content

Commit

Permalink
PTX: Add cuda::ptx:barrier_cluster_{arrive,wait} (#1366)
Browse files Browse the repository at this point in the history
* Add `cuda::ptx::barrier.cluster.{arrive,wait}`

* Add note about .aligned variants
  • Loading branch information
ahendriksen authored Feb 27, 2024
1 parent cbf7da9 commit 2f09e3d
Show file tree
Hide file tree
Showing 3 changed files with 258 additions and 2 deletions.
50 changes: 48 additions & 2 deletions libcudacxx/docs/ptx.md
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ __device__ static inline void cp_async_bulk_wait_group_read(
|-----------------------|-------------------------|
| [`bar, barrier`] | No |
| [`bar.warp.sync`] | No |
| [`barrier.cluster`] | No |
| [`barrier.cluster`] | CTK-FUTURE, CCCL v2.4.0 |
| [`membar`] | No |
| [`fence`] | CTK-FUTURE, CCCL v2.4.0 |
| [`atom`] | No |
Expand All @@ -892,7 +892,7 @@ __device__ static inline void cp_async_bulk_wait_group_read(

[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
[`barrier.cluster`]: #barriercluster
[`membar`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
[`fence`]: #fence
[`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
Expand All @@ -906,6 +906,52 @@ __device__ static inline void cp_async_bulk_wait_group_read(
[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync


#### `barrier.cluster`

- PTX ISA: [`barrier.cluster`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)

Similar functionality is provided through the builtins
`__cluster_barrier_arrive(), __cluster_barrier_arrive_relaxed(),
__cluster_barrier_wait()`, as well as the `cooperative_groups::cluster_group`
[API](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cluster-group).

The `.aligned` variants of the instructions are not exposed.

**barrier_cluster**:
```cuda
// barrier.cluster.arrive; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive();
// barrier.cluster.wait; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait();
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .release }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_release_t);
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .relaxed }
// Marked volatile
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_relaxed_t);
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
// .sem = { .acquire }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait(
cuda::ptx::sem_acquire_t);
```

#### `fence`

- PTX ISA: [`fence`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence)
Expand Down
136 changes: 136 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
Original file line number Diff line number Diff line change
Expand Up @@ -2278,6 +2278,142 @@ _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(

// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
/*
// barrier.cluster.arrive; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive();
*/
#if __cccl_ptx_isa >= 780
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_arrive()
{
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.arrive;"
:
:
: "memory"
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 780

/*
// barrier.cluster.wait; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait();
*/
#if __cccl_ptx_isa >= 780
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_wait()
{
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.wait;"
:
:
: "memory"
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 780

/*
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .release }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_release_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_arrive(
sem_release_t)
{
// __sem == sem_release (due to parameter type constraint)
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.arrive.release;"
:
:
: "memory"
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 800

/*
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .relaxed }
// Marked volatile
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_relaxed_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_arrive(
sem_relaxed_t)
{
// __sem == sem_relaxed (due to parameter type constraint)
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.arrive.relaxed;"
:
:
:
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 800

/*
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
// .sem = { .acquire }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait(
cuda::ptx::sem_acquire_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_wait(
sem_acquire_t)
{
// __sem == sem_acquire (due to parameter type constraint)
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.wait.acquire;"
:
:
: "memory"
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 800

// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
* We use a special strategy to force the generation of the PTX. This is mainly
* a fight against dead-code-elimination in the NVVM layer.
*
* The reason we need this strategy is because certain older versions of ptxas
* segfault when a non-sensical sequence of PTX is generated. So instead, we try
* to force the instantiation and compilation to PTX of all the overloads of the
* PTX wrapping functions.
*
* We do this by writing a function pointer of each overload to the kernel
* parameter `fn_ptr`.
*
* Because `fn_ptr` is possibly visible outside this translation unit, the
* compiler must compile all the functions which are stored.
*
*/

__global__ void test_barrier_cluster(void ** fn_ptr) {
#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.arrive;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));
));
#endif // __cccl_ptx_isa >= 780

#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.wait;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));
));
#endif // __cccl_ptx_isa >= 780

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.arrive.release;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.arrive.relaxed;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.wait.acquire;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));
));
#endif // __cccl_ptx_isa >= 800
}

int main(int, char**)
{
return 0;
}

0 comments on commit 2f09e3d

Please sign in to comment.