PTX: Add cuda::ptx::barrier_cluster_{arrive,wait} #1366

Merged 2 commits on Feb 27, 2024
50 changes: 48 additions & 2 deletions libcudacxx/docs/ptx.md
@@ -876,7 +876,7 @@ __device__ static inline void cp_async_bulk_wait_group_read(
|-----------------------|-------------------------|
| [`bar, barrier`] | No |
| [`bar.warp.sync`] | No |
-| [`barrier.cluster`]   | No                      |
+| [`barrier.cluster`]   | CTK-FUTURE, CCCL v2.4.0 |
| [`membar`] | No |
| [`fence`] | CTK-FUTURE, CCCL v2.4.0 |
| [`atom`] | No |
@@ -892,7 +892,7 @@

[`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier
[`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync
-[`barrier.cluster`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
+[`barrier.cluster`]: #barriercluster
[`membar`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
[`fence`]: #fence
[`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
@@ -906,6 +906,52 @@ __device__ static inline void cp_async_bulk_wait_group_read(
[`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol
[`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync


#### `barrier.cluster`

- PTX ISA: [`barrier.cluster`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster)

Similar functionality is provided through the builtins
`__cluster_barrier_arrive()`, `__cluster_barrier_arrive_relaxed()`, and
`__cluster_barrier_wait()`, as well as through the `cooperative_groups::cluster_group`
[API](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#cluster-group).

The `.aligned` variants of the instructions are not exposed.
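
For comparison, a minimal sketch (not part of this PR) of the same synchronization expressed with the `cooperative_groups` API; it assumes a CUDA 12 toolkit and a kernel launched with cluster dimensions:

```cuda
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void cluster_sync_example()
{
  cg::cluster_group cluster = cg::this_cluster();
  // ... produce data visible to other blocks in the cluster ...
  cluster.sync(); // arrive at, then wait on, the cluster-wide barrier
  // ... consume data produced by other blocks in the cluster ...
}
```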

**barrier_cluster**:
```cuda
// barrier.cluster.arrive; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive();

// barrier.cluster.wait; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait();

// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .release }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_release_t);

// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .relaxed }
// Marked volatile
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_relaxed_t);

// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
// .sem = { .acquire }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait(
cuda::ptx::sem_acquire_t);
```
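
A minimal usage sketch (an illustration, not part of this PR): the split arrive/wait pair lets a block overlap independent work with the cluster synchronization. It assumes the kernel is launched with cluster dimensions, and that the `cuda::ptx::sem_release` / `cuda::ptx::sem_acquire` tag values select the overloads listed above:

```cuda
#include <cuda/ptx>

__global__ void cluster_producer_consumer()
{
  // ... write data that other blocks in the cluster will read ...

  // Make this block's writes visible and arrive at the cluster barrier.
  cuda::ptx::barrier_cluster_arrive(cuda::ptx::sem_release);

  // ... independent work that touches no cross-block data ...

  // Wait until all blocks in the cluster have arrived.
  cuda::ptx::barrier_cluster_wait(cuda::ptx::sem_acquire);

  // ... read data written by other blocks in the cluster ...
}
```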

#### `fence`

- PTX ISA: [`fence`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence)
136 changes: 136 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -2278,6 +2278,142 @@ _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read(

// 9.7.12.3. Parallel Synchronization and Communication Instructions: barrier.cluster
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-barrier-cluster
/*
// barrier.cluster.arrive; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive();
*/
#if __cccl_ptx_isa >= 780
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_arrive()
{
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.arrive;"
:
:
: "memory"
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 780

/*
// barrier.cluster.wait; // PTX ISA 78, SM_90
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait();
*/
#if __cccl_ptx_isa >= 780
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_wait()
{
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.wait;"
:
:
: "memory"
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 780

/*
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .release }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_release_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_arrive(
sem_release_t)
{
// __sem == sem_release (due to parameter type constraint)
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.arrive.release;"
:
:
: "memory"
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 800

/*
// barrier.cluster.arrive.sem; // PTX ISA 80, SM_90
// .sem = { .relaxed }
// Marked volatile
template <typename=void>
__device__ static inline void barrier_cluster_arrive(
cuda::ptx::sem_relaxed_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_arrive(
sem_relaxed_t)
{
// __sem == sem_relaxed (due to parameter type constraint)
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.arrive.relaxed;"
:
:
:
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_arrive_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 800

/*
// barrier.cluster.wait.sem; // PTX ISA 80, SM_90
// .sem = { .acquire }
// Marked volatile and as clobbering memory
template <typename=void>
__device__ static inline void barrier_cluster_wait(
cuda::ptx::sem_acquire_t);
*/
#if __cccl_ptx_isa >= 800
extern "C" _CCCL_DEVICE void __cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
template <typename=void>
_CCCL_DEVICE static inline void barrier_cluster_wait(
sem_acquire_t)
{
// __sem == sem_acquire (due to parameter type constraint)
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm volatile (
"barrier.cluster.wait.acquire;"
:
:
: "memory"
);
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_barrier_cluster_wait_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 800

// 9.7.12.4. Parallel Synchronization and Communication Instructions: membar/fence
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence
@@ -0,0 +1,74 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
* We use a special strategy to force the generation of the PTX. This is mainly
* a fight against dead-code-elimination in the NVVM layer.
*
* The reason we need this strategy is because certain older versions of ptxas
* segfault when a non-sensical sequence of PTX is generated. So instead, we try
* to force the instantiation and compilation to PTX of all the overloads of the
* PTX wrapping functions.
*
* We do this by writing a function pointer of each overload to the kernel
* parameter `fn_ptr`.
*
* Because `fn_ptr` is possibly visible outside this translation unit, the
* compiler must compile all the functions which are stored.
*
*/

__global__ void test_barrier_cluster(void ** fn_ptr) {
#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.arrive;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_arrive));
));
#endif // __cccl_ptx_isa >= 780

#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.wait;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)()>(cuda::ptx::barrier_cluster_wait));
));
#endif // __cccl_ptx_isa >= 780

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.arrive.release;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t)>(cuda::ptx::barrier_cluster_arrive));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.arrive.relaxed;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_relaxed_t)>(cuda::ptx::barrier_cluster_arrive));
));
#endif // __cccl_ptx_isa >= 800

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// barrier.cluster.wait.acquire;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_acquire_t)>(cuda::ptx::barrier_cluster_wait));
));
#endif // __cccl_ptx_isa >= 800
}

int main(int, char**)
{
return 0;
}