diff --git a/libcudacxx/include/cuda/annotated_ptr b/libcudacxx/include/cuda/annotated_ptr index bd9f26ad591..f5e04e56623 100644 --- a/libcudacxx/include/cuda/annotated_ptr +++ b/libcudacxx/include/cuda/annotated_ptr @@ -3,50 +3,128 @@ * * NVIDIA SOFTWARE LICENSE * - * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). + * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the + * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). * - * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. + * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. + * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By + * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of + * this license, and you take legal and financial responsibility for the actions of your permitted users. * - * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, + * regulation or generally accepted practices or guidelines in the relevant jurisdictions. * - * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. + * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install + * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this + * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under + * this license. * * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: - * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. 
The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. 
+ * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. * - * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. + * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. * - * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + * 6. COMPONENTS UNDER OTHER LICENSES. 
The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. * - * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + * 9. LIMITATIONS OF LIABILITY. 
TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. * - * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. + * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. All provisions of this license will survive termination, + * except for the license granted to you. * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * 11. APPLICABLE LAW. 
This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * 14. GOVERNMENT USE. 
The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. 
August 20, 2021) */ @@ -71,56 +149,96 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -class access_property { - private: - std::uint64_t __descriptor = 0; - - public: - struct shared {}; - struct global {}; - struct persisting { - _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept { - return cudaAccessProperty::cudaAccessPropertyPersisting; - } - }; - struct streaming { - _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept { - return cudaAccessProperty::cudaAccessPropertyStreaming; - } - }; - struct normal { - _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept { - return cudaAccessProperty::cudaAccessPropertyNormal; - } - }; - - _CCCL_HOST_DEVICE constexpr access_property(global) noexcept : __descriptor(__detail_ap::__sm_80::__interleave_normal()) {} - _CCCL_HOST_DEVICE constexpr access_property() noexcept : __descriptor(__detail_ap::__sm_80::__interleave_normal()) {} - constexpr access_property(access_property const&) noexcept = default; - access_property& operator=(const access_property& other) noexcept = default; - - _CCCL_HOST_DEVICE constexpr access_property(normal, float __fraction) : __descriptor(__detail_ap::__interleave(normal{}, __fraction)) {} - _CCCL_HOST_DEVICE constexpr access_property(streaming, float __fraction) : __descriptor(__detail_ap::__interleave(streaming{}, __fraction)) {} - _CCCL_HOST_DEVICE constexpr access_property(persisting, float __fraction) : __descriptor(__detail_ap::__interleave(persisting{}, __fraction)) {} - _CCCL_HOST_DEVICE constexpr access_property(normal, float __fraction, streaming) : __descriptor(__detail_ap::__interleave(normal{}, __fraction, streaming{})) {} - _CCCL_HOST_DEVICE constexpr access_property(persisting, float __fraction, streaming) : __descriptor(__detail_ap::__interleave(persisting{}, __fraction, streaming{})) {} - - _CCCL_HOST_DEVICE constexpr access_property(normal) noexcept : access_property(normal{}, 1.0) {} - _CCCL_HOST_DEVICE constexpr access_property(streaming) noexcept : access_property(streaming{}, 1.0) {} - _CCCL_HOST_DEVICE constexpr access_property(persisting) noexcept : access_property(persisting{}, 1.0) {} - - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, normal) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, normal{})) {} - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, streaming) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, streaming{})) {} - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, persisting) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, persisting{})) {} - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, normal, streaming) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, normal{}, streaming{})) {} - _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, persisting, streaming) - : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, persisting{}, streaming{})) {} - - _CCCL_HOST_DEVICE constexpr explicit operator std::uint64_t() const noexcept { return __descriptor; } +class access_property +{ +private: + std::uint64_t __descriptor = 0; + +public: + struct shared + {}; + struct global + {}; + struct persisting + { + _CCCL_HOST_DEVICE constexpr 
operator cudaAccessProperty() const noexcept + { + return cudaAccessProperty::cudaAccessPropertyPersisting; + } + }; + struct streaming + { + _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept + { + return cudaAccessProperty::cudaAccessPropertyStreaming; + } + }; + struct normal + { + _CCCL_HOST_DEVICE constexpr operator cudaAccessProperty() const noexcept + { + return cudaAccessProperty::cudaAccessPropertyNormal; + } + }; + + _CCCL_HOST_DEVICE constexpr access_property(global) noexcept + : __descriptor(__detail_ap::__sm_80::__interleave_normal()) + {} + _CCCL_HOST_DEVICE constexpr access_property() noexcept + : __descriptor(__detail_ap::__sm_80::__interleave_normal()) + {} + constexpr access_property(access_property const&) noexcept = default; + access_property& operator=(const access_property& other) noexcept = default; + + _CCCL_HOST_DEVICE constexpr access_property(normal, float __fraction) + : __descriptor(__detail_ap::__interleave(normal{}, __fraction)) + {} + _CCCL_HOST_DEVICE constexpr access_property(streaming, float __fraction) + : __descriptor(__detail_ap::__interleave(streaming{}, __fraction)) + {} + _CCCL_HOST_DEVICE constexpr access_property(persisting, float __fraction) + : __descriptor(__detail_ap::__interleave(persisting{}, __fraction)) + {} + _CCCL_HOST_DEVICE constexpr access_property(normal, float __fraction, streaming) + : __descriptor(__detail_ap::__interleave(normal{}, __fraction, streaming{})) + {} + _CCCL_HOST_DEVICE constexpr access_property(persisting, float __fraction, streaming) + : __descriptor(__detail_ap::__interleave(persisting{}, __fraction, streaming{})) + {} + + _CCCL_HOST_DEVICE constexpr access_property(normal) noexcept + : access_property(normal{}, 1.0) + {} + _CCCL_HOST_DEVICE constexpr access_property(streaming) noexcept + : access_property(streaming{}, 1.0) + {} + _CCCL_HOST_DEVICE constexpr access_property(persisting) noexcept + : access_property(persisting{}, 1.0) + {} + + _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, normal) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, normal{})) + {} + _CCCL_HOST_DEVICE constexpr access_property(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, streaming) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, streaming{})) + {} + _CCCL_HOST_DEVICE constexpr access_property( + void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, persisting) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, persisting{})) + {} + _CCCL_HOST_DEVICE constexpr access_property( + void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, normal, streaming) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, normal{}, streaming{})) + {} + _CCCL_HOST_DEVICE constexpr access_property( + void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, persisting, streaming) + : __descriptor(__detail_ap::__block(__ptr, __hit_bytes, __total_bytes, persisting{}, streaming{})) + {} + + _CCCL_HOST_DEVICE constexpr explicit operator std::uint64_t() const noexcept + { + return __descriptor; + } }; _LIBCUDACXX_END_NAMESPACE_CUDA @@ -130,195 +248,201 @@ _LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_CUDA template -_CCCL_HOST_DEVICE -_Tp* associate_access_property(_Tp* __ptr, _Property __prop) { +_CCCL_HOST_DEVICE _Tp* associate_access_property(_Tp* __ptr, _Property __prop) +{ static_assert( - std::is_same<_Property, 
access_property>::value || - std::is_same<_Property, access_property::persisting>::value || - std::is_same<_Property, access_property::streaming>::value || - std::is_same<_Property, access_property::normal>::value || - std::is_same<_Property, access_property::global>::value || - std::is_same<_Property, access_property::shared>::value - , "property is not convertible to cuda::access_property"); + std::is_same<_Property, access_property>::value || std::is_same<_Property, access_property::persisting>::value + || std::is_same<_Property, access_property::streaming>::value + || std::is_same<_Property, access_property::normal>::value + || std::is_same<_Property, access_property::global>::value + || std::is_same<_Property, access_property::shared>::value, + "property is not convertible to cuda::access_property"); return __detail_ap::__associate(__ptr, __prop); } template -_CCCL_HOST_DEVICE -void apply_access_property(const volatile void* __ptr, const _Shape __shape, access_property::persisting __prop) noexcept { - NV_IF_TARGET(NV_PROVIDES_SM_80,( - if (!__isGlobal((void*)__ptr)) return; - - char* __p = reinterpret_cast(const_cast(__ptr)); - static constexpr std::size_t _LINE_SIZE = 128; - std::size_t __nbytes = static_cast(__shape); - std::size_t __end = ((std::uintptr_t)(__p + __nbytes) % _LINE_SIZE) ? __nbytes + _LINE_SIZE : __nbytes; - __end /= _LINE_SIZE; - - //Apply to all 128 bytes aligned cache lines inclusive of __p - for (std::size_t __i = 0; __i < __end; __i += _LINE_SIZE) { - asm volatile ("prefetch.global.L2::evict_last [%0];" ::"l"(__p + (__i * _LINE_SIZE)) :); - } - )) +_CCCL_HOST_DEVICE void +apply_access_property(const volatile void* __ptr, const _Shape __shape, access_property::persisting __prop) noexcept +{ + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (if (!__isGlobal((void*) __ptr)) return; + + char* __p = reinterpret_cast(const_cast(__ptr)); + static constexpr std::size_t _LINE_SIZE = 128; + std::size_t __nbytes = static_cast(__shape); + std::size_t __end = ((std::uintptr_t)(__p + __nbytes) % _LINE_SIZE) ? __nbytes + _LINE_SIZE : __nbytes; + __end /= _LINE_SIZE; + + // Apply to all 128 bytes aligned cache lines inclusive of __p + for (std::size_t __i = 0; __i < __end; __i += _LINE_SIZE) { + asm volatile("prefetch.global.L2::evict_last [%0];" ::"l"(__p + (__i * _LINE_SIZE)) :); + })) } template -_CCCL_HOST_DEVICE -void apply_access_property(const volatile void* __ptr, const _Shape __shape, access_property::normal __prop) noexcept { - NV_IF_TARGET(NV_PROVIDES_SM_80,( - if (!__isGlobal((void*)__ptr)) return; - - char* __p = reinterpret_cast(const_cast(__ptr)); - static constexpr std::size_t _LINE_SIZE = 128; - std::size_t __nbytes = static_cast(__shape); - std::size_t __end = ((std::uintptr_t)(__p + __nbytes) % _LINE_SIZE) ? __nbytes + _LINE_SIZE : __nbytes; - __end /= _LINE_SIZE; - - //Apply to all 128 bytes aligned cache lines inclusive of __p - for (std::size_t __i = 0; __i < __end; __i += _LINE_SIZE) { - asm volatile ("prefetch.global.L2::evict_normal [%0];" ::"l"(__p + (__i * _LINE_SIZE)) :); - } - )) +_CCCL_HOST_DEVICE void +apply_access_property(const volatile void* __ptr, const _Shape __shape, access_property::normal __prop) noexcept +{ + NV_IF_TARGET( + NV_PROVIDES_SM_80, + (if (!__isGlobal((void*) __ptr)) return; + + char* __p = reinterpret_cast(const_cast(__ptr)); + static constexpr std::size_t _LINE_SIZE = 128; + std::size_t __nbytes = static_cast(__shape); + std::size_t __end = ((std::uintptr_t)(__p + __nbytes) % _LINE_SIZE) ? 
__nbytes + _LINE_SIZE : __nbytes; + __end /= _LINE_SIZE; + + // Apply to all 128 bytes aligned cache lines inclusive of __p + for (std::size_t __i = 0; __i < __end; __i += _LINE_SIZE) { + asm volatile("prefetch.global.L2::evict_normal [%0];" ::"l"(__p + (__i * _LINE_SIZE)) :); + })) } -template -class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> { - public: - using value_type = _Tp; - using size_type = std::size_t; - using reference = value_type&; - using pointer = value_type*; - using const_pointer = value_type const*; - using difference_type = std::ptrdiff_t; - - private: - using __self = annotated_ptr<_Tp, _Property>; - - // Converting from a 64-bit to 32-bit shared pointer and maybe back just for storage might or might not be profitable. - pointer __repr = (pointer)((size_type)nullptr); - - _CCCL_HOST_DEVICE pointer __get(bool __skip_prop = false, difference_type __n = 0) const { - NV_IF_TARGET(NV_IS_DEVICE,( - if (!__skip_prop) { - return static_cast(this->__apply_prop(const_cast(static_cast(__repr + __n)))); - } - )) - return __repr + __n; - } - _CCCL_HOST_DEVICE pointer __offset(difference_type __n, bool __skip_prop = false) const { - return __get(__skip_prop, __n); - } - - public: - _CCCL_HOST_DEVICE pointer operator->() const { - return __get(); - } - - _CCCL_HOST_DEVICE reference operator*() const { - return *__get(); - } - - _CCCL_HOST_DEVICE reference operator[](difference_type __n) const { - return *__offset(__n); - } - - _CCCL_HOST_DEVICE constexpr difference_type operator-(annotated_ptr o) const { - return __repr - o.__repr; - } - - constexpr annotated_ptr() noexcept = default; - constexpr annotated_ptr(annotated_ptr const&) noexcept = default; - // No constexpr for c11 as the method can't be const - _CCCL_CONSTEXPR_CXX14 annotated_ptr& operator=(annotated_ptr const& other) noexcept = default; - - _CCCL_HOST_DEVICE explicit annotated_ptr(pointer __p) +template +class annotated_ptr : public __detail_ap::__annotated_ptr_base<_Property> +{ +public: + using value_type = _Tp; + using size_type = std::size_t; + using reference = value_type&; + using pointer = value_type*; + using const_pointer = value_type const*; + using difference_type = std::ptrdiff_t; + +private: + using __self = annotated_ptr<_Tp, _Property>; + + // Converting from a 64-bit to 32-bit shared pointer and maybe back just for storage might or might not be profitable. 
+ pointer __repr = (pointer) ((size_type) nullptr); + + _CCCL_HOST_DEVICE pointer __get(bool __skip_prop = false, difference_type __n = 0) const + { + NV_IF_TARGET(NV_IS_DEVICE, (if (!__skip_prop) { + return static_cast( + this->__apply_prop(const_cast(static_cast(__repr + __n)))); + })) + return __repr + __n; + } + _CCCL_HOST_DEVICE pointer __offset(difference_type __n, bool __skip_prop = false) const + { + return __get(__skip_prop, __n); + } + +public: + _CCCL_HOST_DEVICE pointer operator->() const + { + return __get(); + } + + _CCCL_HOST_DEVICE reference operator*() const + { + return *__get(); + } + + _CCCL_HOST_DEVICE reference operator[](difference_type __n) const + { + return *__offset(__n); + } + + _CCCL_HOST_DEVICE constexpr difference_type operator-(annotated_ptr o) const + { + return __repr - o.__repr; + } + + constexpr annotated_ptr() noexcept = default; + constexpr annotated_ptr(annotated_ptr const&) noexcept = default; + // No constexpr for c11 as the method can't be const + _CCCL_CONSTEXPR_CXX14 annotated_ptr& operator=(annotated_ptr const& other) noexcept = default; + + _CCCL_HOST_DEVICE explicit annotated_ptr(pointer __p) : __repr(__p) - { - NV_IF_TARGET(NV_IS_DEVICE,( - _LIBCUDACXX_DEBUG_ASSERT((std::is_same<_Property, shared>::value && __isShared(__p) || __isGlobal(__p)), ""); - )) - } - - template - _CCCL_HOST_DEVICE annotated_ptr(pointer __p, _RuntimeProperty __prop) - : __detail_ap::__annotated_ptr_base<_Property>(static_cast(access_property(__prop))), __repr(__p) - { - static_assert(std::is_same<_Property, access_property>::value, - "This method requires annotated_ptr"); - static_assert(std::is_same<_RuntimeProperty, access_property::global>::value || - std::is_same<_RuntimeProperty, access_property::normal>::value || - std::is_same<_RuntimeProperty, access_property::streaming>::value || - std::is_same<_RuntimeProperty, access_property::persisting>::value || - std::is_same<_RuntimeProperty, access_property>::value, - "This method requires RuntimeProperty=global|normal|streaming|persisting|access_property"); - NV_IF_TARGET(NV_IS_DEVICE,( - _LIBCUDACXX_DEBUG_ASSERT((__isGlobal(__p) == true), ""); - )) - } - - template - _CCCL_HOST_DEVICE annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other); - - _CCCL_HOST_DEVICE constexpr explicit operator bool() const noexcept { - return __repr != nullptr; - } - - _CCCL_HOST_DEVICE pointer get() const noexcept { - constexpr bool __is_shared = std::is_same<_Property, access_property::shared>::value; - return __is_shared ? 
__repr : &(*annotated_ptr(__repr)); - } - - _CCCL_HOST_DEVICE _Property __property() const noexcept { - return this->__get_property(); - } + { + NV_IF_TARGET( + NV_IS_DEVICE, + (_LIBCUDACXX_DEBUG_ASSERT((std::is_same<_Property, shared>::value && __isShared(__p) || __isGlobal(__p)), "");)) + } + + template + _CCCL_HOST_DEVICE annotated_ptr(pointer __p, _RuntimeProperty __prop) + : __detail_ap::__annotated_ptr_base<_Property>(static_cast(access_property(__prop))) + , __repr(__p) + { + static_assert(std::is_same<_Property, access_property>::value, + "This method requires annotated_ptr"); + static_assert( + std::is_same<_RuntimeProperty, access_property::global>::value + || std::is_same<_RuntimeProperty, access_property::normal>::value + || std::is_same<_RuntimeProperty, access_property::streaming>::value + || std::is_same<_RuntimeProperty, access_property::persisting>::value + || std::is_same<_RuntimeProperty, access_property>::value, + "This method requires RuntimeProperty=global|normal|streaming|persisting|access_property"); + NV_IF_TARGET(NV_IS_DEVICE, (_LIBCUDACXX_DEBUG_ASSERT((__isGlobal(__p) == true), "");)) + } + + template + _CCCL_HOST_DEVICE annotated_ptr(const annotated_ptr<_TTp, _Prop>& __other); + + _CCCL_HOST_DEVICE constexpr explicit operator bool() const noexcept + { + return __repr != nullptr; + } + + _CCCL_HOST_DEVICE pointer get() const noexcept + { + constexpr bool __is_shared = std::is_same<_Property, access_property::shared>::value; + return __is_shared ? __repr : &(*annotated_ptr(__repr)); + } + + _CCCL_HOST_DEVICE _Property __property() const noexcept + { + return this->__get_property(); + } }; - -template -template -_CCCL_HOST_DEVICE annotated_ptr<_Tp, _Property>::annotated_ptr(const annotated_ptr<_TTp,_Prop>& __other) - : __detail_ap::__annotated_ptr_base<_Property>(__other.__property()), __repr(__other.get()) +template +template +_CCCL_HOST_DEVICE annotated_ptr<_Tp, _Property>::annotated_ptr(const annotated_ptr<_TTp, _Prop>& __other) + : __detail_ap::__annotated_ptr_base<_Property>(__other.__property()) + , __repr(__other.get()) { static_assert(std::is_assignable::value, "pointer must be assignable from other pointer"); - static_assert((std::is_same<_Property, access_property>::value && !std::is_same<_Prop, access_property::shared>::value) || - std::is_same<_Property, _Prop>::value, "Property must be either access_property or other property, and both properties must have same address space"); + static_assert( + (std::is_same<_Property, access_property>::value && !std::is_same<_Prop, access_property::shared>::value) + || std::is_same<_Property, _Prop>::value, + "Property must be either access_property or other property, and both properties must have same address space"); // note: precondition "__other.__rep must be compatible with _Property" currently always holds } -template -_CCCL_HOST_DEVICE -void memcpy_async(_Dst* __dst, - annotated_ptr<_Src,_SrcProperty> __src, - _Shape __shape, _Sync & __sync) { +template +_CCCL_HOST_DEVICE void memcpy_async(_Dst* __dst, annotated_ptr<_Src, _SrcProperty> __src, _Shape __shape, _Sync& __sync) +{ memcpy_async(__dst, &(*__src), __shape, __sync); } -template -_CCCL_HOST_DEVICE -void memcpy_async(annotated_ptr<_Dst,_DstProperty> __dst, - annotated_ptr<_Src,_SrcProperty> __src, - _Shape __shape, _Sync & __sync){ +template +_CCCL_HOST_DEVICE void memcpy_async( + annotated_ptr<_Dst, _DstProperty> __dst, annotated_ptr<_Src, _SrcProperty> __src, _Shape __shape, _Sync& __sync) +{ memcpy_async(&(*__dst), &(*__src), __shape, __sync); } 
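[Reviewer note, not part of the patch] The annotated_ptr hunks above are formatting-only; the public surface of cuda::access_property and cuda::annotated_ptr, and the annotated_ptr overloads of cuda::memcpy_async, are unchanged. A minimal usage sketch of the declarations being reformatted, for reviewers unfamiliar with this header — the kernel and buffer names are hypothetical and the residence hints purely illustrative:

  #include <cuda/annotated_ptr>

  __global__ void scale(const float* in, float* out, size_t n)
  {
    // Compile-time residence hints: reads through `in_p` request a persisting
    // L2 policy, writes through `out_p` a streaming policy.
    cuda::annotated_ptr<const float, cuda::access_property::persisting> in_p{in};
    cuda::annotated_ptr<float, cuda::access_property::streaming> out_p{out};

    // A property chosen at run time can instead be carried by
    // annotated_ptr<T, cuda::access_property>, e.g.
    //   cuda::access_property ap{cuda::access_property::persisting{}, 0.5f};
    //   cuda::annotated_ptr<float, cuda::access_property> out_rt{out, ap};

    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += size_t{gridDim.x} * blockDim.x)
    {
      out_p[i] = 2.0f * in_p[i]; // operator[] applies the hint on each access
    }
  }
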
-template -_CCCL_HOST_DEVICE -void memcpy_async(const _Group & __group, - _Dst * __dst, - annotated_ptr<_Src,_SrcProperty> __src, - _Shape __shape, _Sync & __sync) { +template +_CCCL_HOST_DEVICE void +memcpy_async(const _Group& __group, _Dst* __dst, annotated_ptr<_Src, _SrcProperty> __src, _Shape __shape, _Sync& __sync) +{ memcpy_async(__group, __dst, &(*__src), __shape, __sync); } -template -_CCCL_HOST_DEVICE -void memcpy_async(const _Group & __group, - annotated_ptr<_Dst,_DstProperty> __dst, - annotated_ptr<_Src,_SrcProperty> __src, - _Shape __shape, _Sync & __sync) { +template +_CCCL_HOST_DEVICE void memcpy_async( + const _Group& __group, + annotated_ptr<_Dst, _DstProperty> __dst, + annotated_ptr<_Src, _SrcProperty> __src, + _Shape __shape, + _Sync& __sync) +{ memcpy_async(__group, &(*__dst), &(*__src), __shape, __sync); } diff --git a/libcudacxx/include/cuda/barrier b/libcudacxx/include/cuda/barrier index e19684cfece..99117dde90b 100644 --- a/libcudacxx/include/cuda/barrier +++ b/libcudacxx/include/cuda/barrier @@ -21,8 +21,8 @@ # pragma system_header #endif // no system header -#include #include +#include // Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping // functions. These functions take a pointer to CUtensorMap, so do not need to @@ -54,175 +54,185 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL #ifdef __cccl_lib_experimental_ctk12_cp_async_exposure // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -inline _CCCL_DEVICE -void cp_async_bulk_global_to_shared(void *__dest, const void *__src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_global_to_shared( + void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); - _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address."); - _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address."); - - _CUDA_VPTX::cp_async_bulk( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __src, __size, - ::cuda::device::barrier_native_handle(__bar)); + _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); + _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address."); + _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address."); + + _CUDA_VPTX::cp_async_bulk( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __src, + __size, + ::cuda::device::barrier_native_handle(__bar)); } - // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk -inline _CCCL_DEVICE -void cp_async_bulk_shared_to_global(void *__dest, const void * __src, _CUDA_VSTD::uint32_t __size) +inline _CCCL_DEVICE void cp_async_bulk_shared_to_global(void* __dest, const void* __src, _CUDA_VSTD::uint32_t __size) { - _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); - _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address."); - _LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address."); + _LIBCUDACXX_DEBUG_ASSERT(__size % 16 == 0, "Size must be multiple of 16."); + _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address."); + 
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address."); - _CUDA_VPTX::cp_async_bulk( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __dest, __src, __size); + _CUDA_VPTX::cp_async_bulk(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __dest, __src, __size); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_1d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map , int __c0, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_1d_global_to_shared( + void* __dest, const CUtensorMap* __tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_2d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_2d_global_to_shared( + void* __dest, const CUtensorMap* __tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_3d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_global_to_shared( + void* __dest, + const CUtensorMap* __tensor_map, + int __c0, + int __c1, + int __c2, + ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_4d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, int __c2, int __c3, 
::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_global_to_shared( + void* __dest, + const CUtensorMap* __tensor_map, + int __c0, + int __c1, + int __c2, + int __c3, + ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_5d_global_to_shared( - void *__dest, const CUtensorMap *__tensor_map , int __c0, int __c1, int __c2, int __c3, int __c4, ::cuda::barrier<::cuda::thread_scope_block> &__bar) +inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_global_to_shared( + void* __dest, + const CUtensorMap* __tensor_map, + int __c0, + int __c1, + int __c2, + int __c3, + int __c4, + ::cuda::barrier<::cuda::thread_scope_block>& __bar) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4}; - - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global, - __dest, __tensor_map, __coords, - ::cuda::device::barrier_native_handle(__bar)); + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4}; + + _CUDA_VPTX::cp_async_bulk_tensor( + _CUDA_VPTX::space_cluster, + _CUDA_VPTX::space_global, + __dest, + __tensor_map, + __coords, + ::cuda::device::barrier_native_handle(__bar)); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_1d_shared_to_global( - const CUtensorMap *__tensor_map, int __c0, const void *__src) +inline _CCCL_DEVICE void +cp_async_bulk_tensor_1d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, const void* __src) { - const _CUDA_VSTD::int32_t __coords[]{__c0}; + const _CUDA_VSTD::int32_t __coords[]{__c0}; - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __tensor_map, __coords, __src); + _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_2d_shared_to_global( - const CUtensorMap *__tensor_map, int __c0, int __c1, const void *__src) +inline _CCCL_DEVICE void +cp_async_bulk_tensor_2d_shared_to_global(const CUtensorMap* __tensor_map, int __c0, int __c1, const void* __src) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1}; - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __tensor_map, __coords, __src); + _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_3d_shared_to_global( - 
const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, const void *__src) +inline _CCCL_DEVICE void cp_async_bulk_tensor_3d_shared_to_global( + const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, const void* __src) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2}; + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2}; - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __tensor_map, __coords, __src); + _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_4d_shared_to_global( - const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, const void *__src) +inline _CCCL_DEVICE void cp_async_bulk_tensor_4d_shared_to_global( + const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, const void* __src) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3}; + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3}; - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __tensor_map, __coords, __src); + _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor -inline _CCCL_DEVICE -void cp_async_bulk_tensor_5d_shared_to_global( - const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void *__src) +inline _CCCL_DEVICE void cp_async_bulk_tensor_5d_shared_to_global( + const CUtensorMap* __tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void* __src) { - const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4}; + const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4}; - _CUDA_VPTX::cp_async_bulk_tensor( - _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, - __tensor_map, __coords, __src); + _CUDA_VPTX::cp_async_bulk_tensor(_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared, __tensor_map, __coords, __src); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar -inline _CCCL_DEVICE -void fence_proxy_async_shared_cta() { - _CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared); +inline _CCCL_DEVICE void fence_proxy_async_shared_cta() +{ + _CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group -inline _CCCL_DEVICE -void cp_async_bulk_commit_group() +inline _CCCL_DEVICE void cp_async_bulk_commit_group() { - _CUDA_VPTX::cp_async_bulk_commit_group(); + _CUDA_VPTX::cp_async_bulk_commit_group(); } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group template -inline _CCCL_DEVICE -void cp_async_bulk_wait_group_read() +inline _CCCL_DEVICE void cp_async_bulk_wait_group_read() { static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported."); _CUDA_VPTX::cp_async_bulk_wait_group_read(_CUDA_VPTX::n32_t<__n_prior>{}); diff --git a/libcudacxx/include/cuda/discard_memory b/libcudacxx/include/cuda/discard_memory 
index 5893bf6108e..d6c772d57a2 100644 --- a/libcudacxx/include/cuda/discard_memory +++ b/libcudacxx/include/cuda/discard_memory @@ -21,8 +21,8 @@ # pragma system_header #endif // no system header -#include #include +#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA @@ -37,14 +37,14 @@ inline _CCCL_HOST_DEVICE void discard_memory(volatile void* __ptr, size_t __nbyt NV_PROVIDES_SM_80, (if (!__isGlobal((void*) __ptr)) return; - char* __p = reinterpret_cast(const_cast(__ptr)); - char* const __end_p = __p + __nbytes; + char* __p = reinterpret_cast(const_cast(__ptr)); + char* const __end_p = __p + __nbytes; static constexpr size_t _LINE_SIZE = 128; // Trim the first block and last block if they're not 128 bytes aligned - size_t __misalignment = reinterpret_cast(__p) % _LINE_SIZE; - char* __start_aligned = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment); - char* const __end_aligned = __end_p - (reinterpret_cast(__end_p) % _LINE_SIZE); + size_t __misalignment = reinterpret_cast(__p) % _LINE_SIZE; + char* __start_aligned = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment); + char* const __end_aligned = __end_p - (reinterpret_cast(__end_p) % _LINE_SIZE); while (__start_aligned < __end_aligned) { asm volatile("discard.global.L2 [%0], 128;" ::"l"(__start_aligned) :); diff --git a/libcudacxx/include/cuda/functional b/libcudacxx/include/cuda/functional index 955631e23a5..f8aaef4f0a9 100644 --- a/libcudacxx/include/cuda/functional +++ b/libcudacxx/include/cuda/functional @@ -4,50 +4,128 @@ * * NVIDIA SOFTWARE LICENSE * - * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). + * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the + * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). * - * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. + * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. + * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By + * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of + * this license, and you take legal and financial responsibility for the actions of your permitted users. * - * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, + * regulation or generally accepted practices or guidelines in the relevant jurisdictions. * - * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. 
NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. + * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install + * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this + * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under + * this license. * * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: - * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. 
You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. - * - * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. - * - * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. - * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. - * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. - * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. - * - * 9. LIMITATIONS OF LIABILITY. 
TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. - * - * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. - * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. - * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. - * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. 
and that you are not otherwise prohibited from receiving the SOFTWARE. - * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. - * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. + * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * + * 4. PRE-RELEASE. 
SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. + * + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. + * + * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. + * + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. + * + * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * + * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. + * + * 10. TERMINATION. 
Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. All provisions of this license will survive termination, + * except for the license granted to you. + * + * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. + * + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. + * + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * + * 15. ENTIRE AGREEMENT. 
This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. August 20, 2021) */ @@ -65,8 +143,8 @@ # pragma system_header #endif // no system header -#include #include +#include #include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA @@ -74,90 +152,72 @@ namespace __detail { template -class __return_type_wrapper { - private: +class __return_type_wrapper +{ +private: _DecayFn __fn_; - public: +public: __return_type_wrapper() = delete; template , _DecayFn>::value>> - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - explicit __return_type_wrapper(_Fn &&__fn) noexcept - : __fn_(_CUDA_VSTD::forward<_Fn>(__fn)) {} + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 explicit __return_type_wrapper(_Fn&& __fn) noexcept + : __fn_(_CUDA_VSTD::forward<_Fn>(__fn)) + {} template - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - _Ret operator()(_As&&... __as) & noexcept { + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Ret operator()(_As&&... __as) & noexcept + { #if !defined(__NVCC__) || defined(__CUDA_ARCH__) - static_assert( - _CUDA_VSTD::is_same< - _Ret, - typename _CUDA_VSTD::__invoke_of<_DecayFn&, _As...>::type - >::value, - "Return type shall match the proclaimed one exactly"); + static_assert(_CUDA_VSTD::is_same<_Ret, typename _CUDA_VSTD::__invoke_of<_DecayFn&, _As...>::type>::value, + "Return type shall match the proclaimed one exactly"); #endif return _CUDA_VSTD::__invoke(__fn_, _CUDA_VSTD::forward<_As>(__as)...); } template - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - _Ret operator()(_As&&... __as) && noexcept { + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Ret operator()(_As&&... __as) && noexcept + { #if !defined(__NVCC__) || defined(__CUDA_ARCH__) - static_assert( - _CUDA_VSTD::is_same< - _Ret, - typename _CUDA_VSTD::__invoke_of<_DecayFn, _As...>::type - >::value, - "Return type shall match the proclaimed one exactly"); + static_assert(_CUDA_VSTD::is_same<_Ret, typename _CUDA_VSTD::__invoke_of<_DecayFn, _As...>::type>::value, + "Return type shall match the proclaimed one exactly"); #endif - return _CUDA_VSTD::__invoke(_CUDA_VSTD::move(__fn_), - _CUDA_VSTD::forward<_As>(__as)...); + return _CUDA_VSTD::__invoke(_CUDA_VSTD::move(__fn_), _CUDA_VSTD::forward<_As>(__as)...); } template - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - _Ret operator()(_As&&... __as) const& noexcept { + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Ret operator()(_As&&... __as) const& noexcept + { #if !defined(__NVCC__) || defined(__CUDA_ARCH__) - static_assert( - _CUDA_VSTD::is_same< - _Ret, - typename _CUDA_VSTD::__invoke_of::type - >::value, - "Return type shall match the proclaimed one exactly"); + static_assert(_CUDA_VSTD::is_same<_Ret, typename _CUDA_VSTD::__invoke_of::type>::value, + "Return type shall match the proclaimed one exactly"); #endif return _CUDA_VSTD::__invoke(__fn_, _CUDA_VSTD::forward<_As>(__as)...); } template - _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 - _Ret operator()(_As&&... 
__as) const&& noexcept { + _LIBCUDACXX_INLINE_VISIBILITY _CCCL_CONSTEXPR_CXX14 _Ret operator()(_As&&... __as) const&& noexcept + { #if !defined(__NVCC__) || defined(__CUDA_ARCH__) - static_assert( - _CUDA_VSTD::is_same< - _Ret, - typename _CUDA_VSTD::__invoke_of::type - >::value, - "Return type shall match the proclaimed one exactly"); + static_assert(_CUDA_VSTD::is_same<_Ret, typename _CUDA_VSTD::__invoke_of::type>::value, + "Return type shall match the proclaimed one exactly"); #endif - return _CUDA_VSTD::__invoke(_CUDA_VSTD::move(__fn_), - _CUDA_VSTD::forward<_As>(__as)...); + return _CUDA_VSTD::__invoke(_CUDA_VSTD::move(__fn_), _CUDA_VSTD::forward<_As>(__as)...); } }; -} // __detail +} // namespace __detail template -inline _LIBCUDACXX_INLINE_VISIBILITY -__detail::__return_type_wrapper<_Ret, _CUDA_VSTD::__decay_t<_Fn>> -proclaim_return_type(_Fn&& __fn) noexcept { - return __detail::__return_type_wrapper<_Ret, _CUDA_VSTD::__decay_t<_Fn>>( - _CUDA_VSTD::forward<_Fn>(__fn)); +inline _LIBCUDACXX_INLINE_VISIBILITY __detail::__return_type_wrapper<_Ret, _CUDA_VSTD::__decay_t<_Fn>> +proclaim_return_type(_Fn&& __fn) noexcept +{ + return __detail::__return_type_wrapper<_Ret, _CUDA_VSTD::__decay_t<_Fn>>(_CUDA_VSTD::forward<_Fn>(__fn)); } _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/memory_resource b/libcudacxx/include/cuda/memory_resource index a138995aa5f..894fd9eb2dd 100644 --- a/libcudacxx/include/cuda/memory_resource +++ b/libcudacxx/include/cuda/memory_resource @@ -80,17 +80,17 @@ class resource_ref { */ // clang-format on -# include // cuda_runtime_api needs to come first +#include // cuda_runtime_api needs to come first -# include "__cccl_config" +#include "__cccl_config" -# if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -# elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -# endif // no system header +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header #include #include diff --git a/libcudacxx/include/cuda/pipeline b/libcudacxx/include/cuda/pipeline index 509dfd65cbe..583a6fb6c72 100644 --- a/libcudacxx/include/cuda/pipeline +++ b/libcudacxx/include/cuda/pipeline @@ -3,50 +3,128 @@ * * NVIDIA SOFTWARE LICENSE * - * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). + * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the + * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). * - * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. + * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. 
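The cuda/functional hunk above reflows __return_type_wrapper and cuda::proclaim_return_type without changing behaviour: the wrapper still static_asserts that the deduced result of the invocation matches the proclaimed return type. A brief usage sketch, assuming nvcc's extended lambdas are enabled (--extended-lambda); the kernel and helper names are hypothetical and not part of this patch:

#include <cuda/functional>

// Host code cannot invoke a __device__ lambda, so its return type cannot be
// deduced there; proclaim_return_type<int> supplies it up front, and the
// wrapper's static_assert (seen in the hunk above) re-checks it in device code.
template <class F>
__global__ void apply_one(F f, int* out)
{
  *out = f(21); // device-side call through the wrapper
}

void launch_example(int* d_out) // hypothetical host-side helper
{
  auto twice = cuda::proclaim_return_type<int>([] __device__(int x) { return 2 * x; });
  apply_one<<<1, 1>>>(twice, d_out);
}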
+ * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By + * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of + * this license, and you take legal and financial responsibility for the actions of your permitted users. * - * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, + * regulation or generally accepted practices or guidelines in the relevant jurisdictions. * - * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. + * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install + * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this + * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under + * this license. * * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: - * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. 
As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. + * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. * - * 4. PRE-RELEASE. 
SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. + * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. * - * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. 
“Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. * - * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. * - * 10. TERMINATION. 
Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. + * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. All provisions of this license will survive termination, + * except for the license granted to you. * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. 
Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. 
If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. August 20, 2021) */ @@ -63,532 +141,563 @@ # pragma system_header #endif // no system header -#include #include +#include #include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA - // Forward declaration in barrier of pipeline - enum class pipeline_role { - producer, - consumer - }; - - template - struct __pipeline_stage { - barrier<_Scope> __produced; - barrier<_Scope> __consumed; - }; - - template - class pipeline_shared_state { - public: - pipeline_shared_state() = default; - pipeline_shared_state(const pipeline_shared_state &) = delete; - pipeline_shared_state(pipeline_shared_state &&) = delete; - pipeline_shared_state & operator=(pipeline_shared_state &&) = delete; - pipeline_shared_state & operator=(const pipeline_shared_state &) = delete; - - private: - __pipeline_stage<_Scope> __stages[_Stages_count]; - atomic __refcount; - - template - friend class pipeline; - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role); - }; - - struct __pipeline_asm_helper { - _CCCL_DEVICE - static inline uint32_t __lane_id() - { - NV_IF_ELSE_TARGET( - NV_IS_DEVICE, - ( - uint32_t __lane_id; - asm volatile ("mov.u32 %0, %%laneid;" : "=r"(__lane_id)); - return __lane_id; - ), - ( - return 0; - ) - ) - } - }; - - template - class pipeline { - public: - pipeline(pipeline &&) = default; - pipeline(const pipeline &) = delete; - pipeline & operator=(pipeline &&) = delete; - pipeline & operator=(const pipeline &) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY - ~pipeline() - { - if (__active) { - (void)quit(); - } - } - - _LIBCUDACXX_INLINE_VISIBILITY - bool quit() - { - bool __elected; - uint32_t __sub_count; -NV_IF_TARGET(NV_IS_DEVICE, - const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast(__shared_state_get_refcount())); - const uint32_t __elected_id = __ffs(__match_mask) - 1; - __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); - __sub_count = __popc(__match_mask); -, - __elected = true; - __sub_count = 1; -) - bool __released = false; - if (__elected) { - const uint32_t __old = __shared_state_get_refcount()->fetch_sub(__sub_count); - 
const bool __last = (__old == __sub_count); - if (__last) { - for (uint8_t __stage = 0; __stage < __stages_count; ++__stage) { - __shared_state_get_stage(__stage)->__produced.~barrier(); - __shared_state_get_stage(__stage)->__consumed.~barrier(); - } - __released = true; - } - } - __active = false; - return __released; - } - - _LIBCUDACXX_INLINE_VISIBILITY - void producer_acquire() - { - barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__consumed; - __stage_barrier.wait_parity(__consumed_phase_parity); - } - - _LIBCUDACXX_INLINE_VISIBILITY - void producer_commit() - { - barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__produced; - (void)__memcpy_completion_impl::__defer(__completion_mechanism::__async_group, __single_thread_group{}, 0, __stage_barrier); - (void)__stage_barrier.arrive(); - if (++__head == __stages_count) { - __head = 0; - __consumed_phase_parity = !__consumed_phase_parity; - } - } - - _LIBCUDACXX_INLINE_VISIBILITY - void consumer_wait() - { - barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced; - __stage_barrier.wait_parity(__produced_phase_parity); - } - - _LIBCUDACXX_INLINE_VISIBILITY - void consumer_release() - { - (void)__shared_state_get_stage(__tail)->__consumed.arrive(); - if (++__tail == __stages_count) { - __tail = 0; - __produced_phase_parity = !__produced_phase_parity; - } - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration) - { - barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced; - return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( - _CUDA_VSTD::__barrier_poll_tester_parity>( - &__stage_barrier, - __produced_phase_parity), - _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration) - ); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point) - { - return consumer_wait_for(__time_point - _Clock::now()); - } - - private: - uint8_t __head : 8; - uint8_t __tail : 8; - const uint8_t __stages_count : 8; - bool __consumed_phase_parity : 1; - bool __produced_phase_parity : 1; - bool __active : 1; - // TODO: Remove partitioned on next ABI break - const bool __partitioned : 1; - char * const __shared_state; - - - _LIBCUDACXX_INLINE_VISIBILITY - pipeline(char * __shared_state, uint8_t __stages_count, bool __partitioned) - : __head(0) - , __tail(0) - , __stages_count(__stages_count) - , __consumed_phase_parity(true) - , __produced_phase_parity(false) - , __active(true) - , __partitioned(__partitioned) - , __shared_state(__shared_state) - {} - - _LIBCUDACXX_INLINE_VISIBILITY - __pipeline_stage<_Scope> * __shared_state_get_stage(uint8_t __stage) - { - ptrdiff_t __stage_offset = __stage * sizeof(__pipeline_stage<_Scope>); - return reinterpret_cast<__pipeline_stage<_Scope>*>(__shared_state + __stage_offset); - } - - _LIBCUDACXX_INLINE_VISIBILITY - atomic * __shared_state_get_refcount() +// Forward declaration in barrier of pipeline +enum class pipeline_role +{ + producer, + consumer +}; + +template +struct __pipeline_stage +{ + barrier<_Scope> __produced; + barrier<_Scope> __consumed; +}; + +template +class pipeline_shared_state +{ +public: + pipeline_shared_state() = default; + pipeline_shared_state(const pipeline_shared_state&) = delete; + pipeline_shared_state(pipeline_shared_state&&) = delete; + pipeline_shared_state& operator=(pipeline_shared_state&&) = delete; + 
pipeline_shared_state& operator=(const pipeline_shared_state&) = delete; + +private: + __pipeline_stage<_Scope> __stages[_Stages_count]; + atomic __refcount; + + template + friend class pipeline; + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, + pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state, + size_t __producer_count); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, + pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state, + pipeline_role __role); +}; + +struct __pipeline_asm_helper +{ + _CCCL_DEVICE static inline uint32_t __lane_id() + { + NV_IF_ELSE_TARGET( + NV_IS_DEVICE, + (uint32_t __lane_id; asm volatile("mov.u32 %0, %%laneid;" + : "=r"(__lane_id)); + return __lane_id;), + (return 0;)) + } +}; + +template +class pipeline +{ +public: + pipeline(pipeline&&) = default; + pipeline(const pipeline&) = delete; + pipeline& operator=(pipeline&&) = delete; + pipeline& operator=(const pipeline&) = delete; + + _LIBCUDACXX_INLINE_VISIBILITY ~pipeline() + { + if (__active) + { + (void) quit(); + } + } + + _LIBCUDACXX_INLINE_VISIBILITY bool quit() + { + bool __elected; + uint32_t __sub_count; + NV_IF_TARGET( + NV_IS_DEVICE, + const uint32_t __match_mask = + __match_any_sync(__activemask(), reinterpret_cast(__shared_state_get_refcount())); + const uint32_t __elected_id = __ffs(__match_mask) - 1; + __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); + __sub_count = __popc(__match_mask); + , __elected = true; + __sub_count = 1;) + bool __released = false; + if (__elected) + { + const uint32_t __old = __shared_state_get_refcount()->fetch_sub(__sub_count); + const bool __last = (__old == __sub_count); + if (__last) + { + for (uint8_t __stage = 0; __stage < __stages_count; ++__stage) { - ptrdiff_t __refcount_offset = __stages_count * sizeof(__pipeline_stage<_Scope>); - return reinterpret_cast*>(__shared_state + __refcount_offset); + __shared_state_get_stage(__stage)->__produced.~barrier(); + __shared_state_get_stage(__stage)->__consumed.~barrier(); } - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role); - }; - - template - _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state) + __released = true; + } + } + __active = false; + return __released; + } + + _LIBCUDACXX_INLINE_VISIBILITY void producer_acquire() + { + barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__head)->__consumed; + __stage_barrier.wait_parity(__consumed_phase_parity); + } + + _LIBCUDACXX_INLINE_VISIBILITY void producer_commit() + { + barrier<_Scope>& 
__stage_barrier = __shared_state_get_stage(__head)->__produced; + (void) __memcpy_completion_impl::__defer( + __completion_mechanism::__async_group, __single_thread_group{}, 0, __stage_barrier); + (void) __stage_barrier.arrive(); + if (++__head == __stages_count) { - const uint32_t __group_size = static_cast(__group.size()); - const uint32_t __thread_rank = static_cast(__group.thread_rank()); - - if (__thread_rank == 0) { - for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { - init(&__shared_state->__stages[__stage].__consumed, __group_size); - init(&__shared_state->__stages[__stage].__produced, __group_size); - } - __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); - } - __group.sync(); - - return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, false); + __head = 0; + __consumed_phase_parity = !__consumed_phase_parity; } - - template - _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, size_t __producer_count) + } + + _LIBCUDACXX_INLINE_VISIBILITY void consumer_wait() + { + barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__tail)->__produced; + __stage_barrier.wait_parity(__produced_phase_parity); + } + + _LIBCUDACXX_INLINE_VISIBILITY void consumer_release() + { + (void) __shared_state_get_stage(__tail)->__consumed.arrive(); + if (++__tail == __stages_count) { - const uint32_t __group_size = static_cast(__group.size()); - const uint32_t __thread_rank = static_cast(__group.thread_rank()); - - if (__thread_rank == 0) { - const size_t __consumer_count = __group_size - __producer_count; - for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { - init(&__shared_state->__stages[__stage].__consumed, __consumer_count); - init(&__shared_state->__stages[__stage].__produced, __producer_count); - } - __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); - } - __group.sync(); - - return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, true); + __tail = 0; + __produced_phase_parity = !__produced_phase_parity; } - - template - _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, pipeline_role __role) + } + + template + _LIBCUDACXX_INLINE_VISIBILITY bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __duration) + { + barrier<_Scope>& __stage_barrier = __shared_state_get_stage(__tail)->__produced; + return _CUDA_VSTD::__libcpp_thread_poll_with_backoff( + _CUDA_VSTD::__barrier_poll_tester_parity>(&__stage_barrier, __produced_phase_parity), + _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration)); + } + + template + _LIBCUDACXX_INLINE_VISIBILITY bool + consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time_point) + { + return consumer_wait_for(__time_point - _Clock::now()); + } + +private: + uint8_t __head : 8; + uint8_t __tail : 8; + const uint8_t __stages_count : 8; + bool __consumed_phase_parity : 1; + bool __produced_phase_parity : 1; + bool __active : 1; + // TODO: Remove partitioned on next ABI break + const bool __partitioned : 1; + char* const __shared_state; + + _LIBCUDACXX_INLINE_VISIBILITY pipeline(char* __shared_state, uint8_t __stages_count, bool __partitioned) + : __head(0) + , __tail(0) + , __stages_count(__stages_count) + , __consumed_phase_parity(true) + , __produced_phase_parity(false) + , 
__active(true) + , __partitioned(__partitioned) + , __shared_state(__shared_state) + {} + + _LIBCUDACXX_INLINE_VISIBILITY __pipeline_stage<_Scope>* __shared_state_get_stage(uint8_t __stage) + { + ptrdiff_t __stage_offset = __stage * sizeof(__pipeline_stage<_Scope>); + return reinterpret_cast<__pipeline_stage<_Scope>*>(__shared_state + __stage_offset); + } + + _LIBCUDACXX_INLINE_VISIBILITY atomic* __shared_state_get_refcount() + { + ptrdiff_t __refcount_offset = __stages_count * sizeof(__pipeline_stage<_Scope>); + return reinterpret_cast*>(__shared_state + __refcount_offset); + } + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, + pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state, + size_t __producer_count); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> + make_pipeline(const _Group& __group, + pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state, + pipeline_role __role); +}; + +template +_LIBCUDACXX_INLINE_VISIBILITY pipeline<_Scope> +make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count>* __shared_state) +{ + const uint32_t __group_size = static_cast(__group.size()); + const uint32_t __thread_rank = static_cast(__group.thread_rank()); + + if (__thread_rank == 0) + { + for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { - const uint32_t __group_size = static_cast(__group.size()); - const uint32_t __thread_rank = static_cast(__group.thread_rank()); - - if (__thread_rank == 0) { - __shared_state->__refcount.store(0, std::memory_order_relaxed); - } - __group.sync(); - - if (__role == pipeline_role::producer) { - bool __elected; - uint32_t __add_count; -NV_IF_TARGET(NV_IS_DEVICE, - const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast(&__shared_state->__refcount)); - const uint32_t __elected_id = __ffs(__match_mask) - 1; - __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); - __add_count = __popc(__match_mask); -, - __elected = true; - __add_count = 1; -) - if (__elected) { - (void)__shared_state->__refcount.fetch_add(__add_count, std::memory_order_relaxed); - } - } - __group.sync(); - - if (__thread_rank == 0) { - const uint32_t __producer_count = __shared_state->__refcount.load(std::memory_order_relaxed); - const uint32_t __consumer_count = __group_size - __producer_count; - for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) { - init(&__shared_state->__stages[__stage].__consumed, __consumer_count); - init(&__shared_state->__stages[__stage].__produced, __producer_count); - } - __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); - } - __group.sync(); - - return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, true); + init(&__shared_state->__stages[__stage].__consumed, __group_size); + init(&__shared_state->__stages[__stage].__produced, __group_size); + } + __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); + } + __group.sync(); + + return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, false); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY pipeline<_Scope> make_pipeline( + const _Group& __group, pipeline_shared_state<_Scope, _Stages_count>* __shared_state, 
size_t __producer_count) +{ + const uint32_t __group_size = static_cast(__group.size()); + const uint32_t __thread_rank = static_cast(__group.thread_rank()); + + if (__thread_rank == 0) + { + const size_t __consumer_count = __group_size - __producer_count; + for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) + { + init(&__shared_state->__stages[__stage].__consumed, __consumer_count); + init(&__shared_state->__stages[__stage].__produced, __producer_count); + } + __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); + } + __group.sync(); + + return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, true); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY pipeline<_Scope> +make_pipeline(const _Group& __group, pipeline_shared_state<_Scope, _Stages_count>* __shared_state, pipeline_role __role) +{ + const uint32_t __group_size = static_cast(__group.size()); + const uint32_t __thread_rank = static_cast(__group.thread_rank()); + + if (__thread_rank == 0) + { + __shared_state->__refcount.store(0, std::memory_order_relaxed); + } + __group.sync(); + + if (__role == pipeline_role::producer) + { + bool __elected; + uint32_t __add_count; + NV_IF_TARGET( + NV_IS_DEVICE, + const uint32_t __match_mask = + __match_any_sync(__activemask(), reinterpret_cast(&__shared_state->__refcount)); + const uint32_t __elected_id = __ffs(__match_mask) - 1; + __elected = (__pipeline_asm_helper::__lane_id() == __elected_id); + __add_count = __popc(__match_mask); + , __elected = true; + __add_count = 1;) + if (__elected) + { + (void) __shared_state->__refcount.fetch_add(__add_count, std::memory_order_relaxed); } + } + __group.sync(); + + if (__thread_rank == 0) + { + const uint32_t __producer_count = __shared_state->__refcount.load(std::memory_order_relaxed); + const uint32_t __consumer_count = __group_size - __producer_count; + for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) + { + init(&__shared_state->__stages[__stage].__consumed, __consumer_count); + init(&__shared_state->__stages[__stage].__produced, __producer_count); + } + __shared_state->__refcount.store(__group_size, std::memory_order_relaxed); + } + __group.sync(); + + return pipeline<_Scope>(reinterpret_cast(__shared_state->__stages), _Stages_count, true); +} _LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE - template - _CCCL_DEVICE - void __pipeline_consumer_wait(pipeline & __pipeline); +template +_CCCL_DEVICE void __pipeline_consumer_wait(pipeline& __pipeline); - _CCCL_DEVICE - inline void __pipeline_consumer_wait(pipeline & __pipeline, uint8_t __prior); +_CCCL_DEVICE inline void __pipeline_consumer_wait(pipeline& __pipeline, uint8_t __prior); _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA - template<> - class pipeline { - public: - pipeline(pipeline &&) = default; - pipeline(const pipeline &) = delete; - pipeline & operator=(pipeline &&) = delete; - pipeline & operator=(const pipeline &) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY - ~pipeline() {} - - _LIBCUDACXX_INLINE_VISIBILITY - bool quit() - { - return true; - } - - _LIBCUDACXX_INLINE_VISIBILITY - void producer_acquire() {} - - _LIBCUDACXX_INLINE_VISIBILITY - void producer_commit() - { -NV_IF_TARGET(NV_PROVIDES_SM_80, - asm volatile ("cp.async.commit_group;"); - ++__head; -) - } - - _LIBCUDACXX_INLINE_VISIBILITY - void consumer_wait() - { -NV_IF_TARGET(NV_PROVIDES_SM_80, - if (__head == __tail) { - return; - } - - const uint8_t __prior = __head - __tail - 1; - 
device::__pipeline_consumer_wait(*this, __prior); - ++__tail; -) - } - - _LIBCUDACXX_INLINE_VISIBILITY - void consumer_release() {} - - template - _LIBCUDACXX_INLINE_VISIBILITY - bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration) - { - (void)__duration; - consumer_wait(); - return true; - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point) - { - (void)__time_point; - consumer_wait(); - return true; - } - - private: - uint8_t __head; - uint8_t __tail; - - _LIBCUDACXX_INLINE_VISIBILITY - pipeline() - : __head(0) - , __tail(0) - {} - - friend _LIBCUDACXX_INLINE_VISIBILITY inline pipeline make_pipeline(); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - void pipeline_consumer_wait_prior(pipeline & __pipeline); - - template - friend _LIBCUDACXX_INLINE_VISIBILITY - pipeline<_Pipeline_scope> __make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state); - }; +template <> +class pipeline +{ +public: + pipeline(pipeline&&) = default; + pipeline(const pipeline&) = delete; + pipeline& operator=(pipeline&&) = delete; + pipeline& operator=(const pipeline&) = delete; + + _LIBCUDACXX_INLINE_VISIBILITY ~pipeline() {} + + _LIBCUDACXX_INLINE_VISIBILITY bool quit() + { + return true; + } + + _LIBCUDACXX_INLINE_VISIBILITY void producer_acquire() {} + + _LIBCUDACXX_INLINE_VISIBILITY void producer_commit() + { + NV_IF_TARGET(NV_PROVIDES_SM_80, asm volatile("cp.async.commit_group;"); ++__head;) + } + + _LIBCUDACXX_INLINE_VISIBILITY void consumer_wait() + { + NV_IF_TARGET( + NV_PROVIDES_SM_80, + if (__head == __tail) { return; } + + const uint8_t __prior = __head - __tail - 1; + device::__pipeline_consumer_wait(*this, __prior); + ++__tail;) + } + + _LIBCUDACXX_INLINE_VISIBILITY void consumer_release() {} + + template + _LIBCUDACXX_INLINE_VISIBILITY bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period>& __duration) + { + (void) __duration; + consumer_wait(); + return true; + } + + template + _LIBCUDACXX_INLINE_VISIBILITY bool + consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration>& __time_point) + { + (void) __time_point; + consumer_wait(); + return true; + } + +private: + uint8_t __head; + uint8_t __tail; + + _LIBCUDACXX_INLINE_VISIBILITY pipeline() + : __head(0) + , __tail(0) + {} + + friend _LIBCUDACXX_INLINE_VISIBILITY inline pipeline make_pipeline(); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY void pipeline_consumer_wait_prior(pipeline& __pipeline); + + template + friend _LIBCUDACXX_INLINE_VISIBILITY pipeline<_Pipeline_scope> __make_pipeline( + const _Group& __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count>* __shared_state); +}; _LIBCUDACXX_END_NAMESPACE_CUDA _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE - template - _CCCL_DEVICE - void __pipeline_consumer_wait(pipeline & __pipeline) - { - (void)__pipeline; -NV_IF_TARGET(NV_PROVIDES_SM_80, - constexpr uint8_t __max_prior = 8; - - asm volatile ("cp.async.wait_group %0;" - : - : "n"(_Prior < __max_prior ? 
_Prior : __max_prior)); -) - } - - _CCCL_DEVICE - inline void __pipeline_consumer_wait(pipeline & __pipeline, uint8_t __prior) - { - switch (__prior) { - case 0: device::__pipeline_consumer_wait<0>(__pipeline); break; - case 1: device::__pipeline_consumer_wait<1>(__pipeline); break; - case 2: device::__pipeline_consumer_wait<2>(__pipeline); break; - case 3: device::__pipeline_consumer_wait<3>(__pipeline); break; - case 4: device::__pipeline_consumer_wait<4>(__pipeline); break; - case 5: device::__pipeline_consumer_wait<5>(__pipeline); break; - case 6: device::__pipeline_consumer_wait<6>(__pipeline); break; - case 7: device::__pipeline_consumer_wait<7>(__pipeline); break; - default: device::__pipeline_consumer_wait<8>(__pipeline); break; - } - } +template +_CCCL_DEVICE void __pipeline_consumer_wait(pipeline& __pipeline) +{ + (void) __pipeline; + NV_IF_TARGET(NV_PROVIDES_SM_80, constexpr uint8_t __max_prior = 8; + + asm volatile("cp.async.wait_group %0;" + : + : "n"(_Prior < __max_prior ? _Prior : __max_prior));) +} + +_CCCL_DEVICE inline void __pipeline_consumer_wait(pipeline& __pipeline, uint8_t __prior) +{ + switch (__prior) + { + case 0: + device::__pipeline_consumer_wait<0>(__pipeline); + break; + case 1: + device::__pipeline_consumer_wait<1>(__pipeline); + break; + case 2: + device::__pipeline_consumer_wait<2>(__pipeline); + break; + case 3: + device::__pipeline_consumer_wait<3>(__pipeline); + break; + case 4: + device::__pipeline_consumer_wait<4>(__pipeline); + break; + case 5: + device::__pipeline_consumer_wait<5>(__pipeline); + break; + case 6: + device::__pipeline_consumer_wait<6>(__pipeline); + break; + case 7: + device::__pipeline_consumer_wait<7>(__pipeline); + break; + default: + device::__pipeline_consumer_wait<8>(__pipeline); + break; + } +} _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE _LIBCUDACXX_BEGIN_NAMESPACE_CUDA - _LIBCUDACXX_INLINE_VISIBILITY - inline pipeline make_pipeline() - { - return pipeline(); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - void pipeline_consumer_wait_prior(pipeline & __pipeline) - { - NV_IF_TARGET(NV_PROVIDES_SM_80, - device::__pipeline_consumer_wait<_Prior>(__pipeline); - __pipeline.__tail = __pipeline.__head - _Prior; - ) - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - void pipeline_producer_commit(pipeline & __pipeline, barrier<_Scope> & __barrier) - { - (void)__pipeline; - NV_IF_TARGET(NV_PROVIDES_SM_80,( - (void)__memcpy_completion_impl::__defer(__completion_mechanism::__async_group, __single_thread_group{}, 0, __barrier); - )); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment __memcpy_async_pipeline(_Group const & __group, _Tp * __destination, _Tp const * __source, _Size __size, pipeline<_Scope> & __pipeline) { - // 1. Set the completion mechanisms that can be used. - // - // Do not (yet) allow async_bulk_group completion. Do not allow - // mbarrier_complete_tx completion, even though it may be possible if - // the pipeline has stage barriers in shared memory. - _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); - - // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. - constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; - constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); - // Cast to char pointers. We don't need the type for alignment anymore and - // erasing the types reduces the number of instantiations of down-stream - // functions. 
- char * __dest_char = reinterpret_cast(__destination); - char const * __src_char = reinterpret_cast(__source); - - // 2. Issue actual copy instructions. - auto __cm = __dispatch_memcpy_async<__align>(__group, __dest_char, __src_char, __size, __allowed_completions); - - // 3. No need to synchronize with copy instructions. - return __memcpy_completion_impl::__defer(__cm, __group, __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); - } - - template _Alignment) ? alignof(_Type) : _Alignment> - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Type * __destination, _Type const * __source, _Size __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); - } - - template - _LIBCUDACXX_INLINE_VISIBILITY - async_contract_fulfillment memcpy_async(void * __destination, void const * __source, _Size __size, pipeline<_Scope> & __pipeline) { - return __memcpy_async_pipeline(__single_thread_group{}, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); - } +_LIBCUDACXX_INLINE_VISIBILITY inline pipeline make_pipeline() +{ + return pipeline(); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void pipeline_consumer_wait_prior(pipeline& __pipeline) +{ + NV_IF_TARGET(NV_PROVIDES_SM_80, device::__pipeline_consumer_wait<_Prior>(__pipeline); + __pipeline.__tail = __pipeline.__head - _Prior;) +} + +template +_LIBCUDACXX_INLINE_VISIBILITY void +pipeline_producer_commit(pipeline& __pipeline, barrier<_Scope>& __barrier) +{ + (void) __pipeline; + NV_IF_TARGET(NV_PROVIDES_SM_80, + ((void) __memcpy_completion_impl::__defer( + __completion_mechanism::__async_group, __single_thread_group{}, 0, __barrier);)); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment __memcpy_async_pipeline( + _Group const& __group, _Tp* __destination, _Tp const* __source, _Size __size, pipeline<_Scope>& __pipeline) +{ + // 1. Set the completion mechanisms that can be used. + // + // Do not (yet) allow async_bulk_group completion. Do not allow + // mbarrier_complete_tx completion, even though it may be possible if + // the pipeline has stage barriers in shared memory. 
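+ // (Editorial note, not part of the patch: __allowed_completions is a bitmask of
+ // __completion_mechanism values. With only __async_group allowed, the dispatch in
+ // step 2 may issue cp.async copies whose completion is handed off to the pipeline
+ // in step 3, or it may fall back to a plain synchronous copy.)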
+ _CUDA_VSTD::uint32_t __allowed_completions = _CUDA_VSTD::uint32_t(__completion_mechanism::__async_group); + + // Alignment: Use the maximum of the alignment of _Tp and that of a possible cuda::aligned_size_t. + constexpr _CUDA_VSTD::size_t __size_align = __get_size_align<_Size>::align; + constexpr _CUDA_VSTD::size_t __align = (alignof(_Tp) < __size_align) ? __size_align : alignof(_Tp); + // Cast to char pointers. We don't need the type for alignment anymore and + // erasing the types reduces the number of instantiations of down-stream + // functions. + char* __dest_char = reinterpret_cast(__destination); + char const* __src_char = reinterpret_cast(__source); + + // 2. Issue actual copy instructions. + auto __cm = __dispatch_memcpy_async<__align>(__group, __dest_char, __src_char, __size, __allowed_completions); + + // 3. No need to synchronize with copy instructions. + return __memcpy_completion_impl::__defer(__cm, __group, __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment memcpy_async( + _Group const& __group, _Type* __destination, _Type const* __source, std::size_t __size, pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); +} + +template _Alignment) ? alignof(_Type) : _Alignment> +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment memcpy_async( + _Group const& __group, + _Type* __destination, + _Type const* __source, + aligned_size_t<_Alignment> __size, + pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline(__group, __destination, __source, __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment +memcpy_async(_Type* __destination, _Type const* __source, _Size __size, pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline(__single_thread_group{}, __destination, __source, __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment memcpy_async( + _Group const& __group, void* __destination, void const* __source, std::size_t __size, pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline( + __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment memcpy_async( + _Group const& __group, + void* __destination, + void const* __source, + aligned_size_t<_Alignment> __size, + pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline( + __group, reinterpret_cast(__destination), reinterpret_cast(__source), __size, __pipeline); +} + +template +_LIBCUDACXX_INLINE_VISIBILITY async_contract_fulfillment +memcpy_async(void* __destination, void const* __source, _Size __size, pipeline<_Scope>& __pipeline) +{ + return __memcpy_async_pipeline( + __single_thread_group{}, + reinterpret_cast(__destination), + reinterpret_cast(__source), + __size, + __pipeline); +} _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/__algorithm_ b/libcudacxx/include/cuda/std/__algorithm_ index 91c4160a8b5..a1762b79ae5 100644 --- a/libcudacxx/include/cuda/std/__algorithm_ +++ b/libcudacxx/include/cuda/std/__algorithm_ @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_ALGORITHM diff --git a/libcudacxx/include/cuda/std/__exception_ b/libcudacxx/include/cuda/std/__exception_ index e5aedc1d49d..c9b2b855f5c 100644 --- a/libcudacxx/include/cuda/std/__exception_ +++ b/libcudacxx/include/cuda/std/__exception_ @@ -12,11 +12,8 @@ #define 
_CUDA_STD_NEW #include "detail/__config" - +#include "detail/__pragma_pop" #include "detail/__pragma_push" - #include "detail/libcxx/include/exception" -#include "detail/__pragma_pop" - #endif // _CUDA_STD_NEW diff --git a/libcudacxx/include/cuda/std/__memory_ b/libcudacxx/include/cuda/std/__memory_ index 1bff78d6773..077c795e1ed 100644 --- a/libcudacxx/include/cuda/std/__memory_ +++ b/libcudacxx/include/cuda/std/__memory_ @@ -12,11 +12,8 @@ #define _CUDA_STD_MEMORY #include "detail/__config" - +#include "detail/__pragma_pop" #include "detail/__pragma_push" - #include "detail/libcxx/include/memory" -#include "detail/__pragma_pop" - #endif // _CUDA_STD_MEMORY diff --git a/libcudacxx/include/cuda/std/__new_ b/libcudacxx/include/cuda/std/__new_ index 3e8aefcdb6f..daaf0f48084 100644 --- a/libcudacxx/include/cuda/std/__new_ +++ b/libcudacxx/include/cuda/std/__new_ @@ -12,11 +12,8 @@ #define _CUDA_STD_NEW #include "detail/__config" - +#include "detail/__pragma_pop" #include "detail/__pragma_push" - #include "detail/libcxx/include/new" -#include "detail/__pragma_pop" - #endif // _CUDA_STD_NEW diff --git a/libcudacxx/include/cuda/std/array b/libcudacxx/include/cuda/std/array index f0bd5785600..4dd41a43020 100644 --- a/libcudacxx/include/cuda/std/array +++ b/libcudacxx/include/cuda/std/array @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_ARRAY diff --git a/libcudacxx/include/cuda/std/atomic b/libcudacxx/include/cuda/std/atomic index 0daab5f2cb5..7908a2274ea 100644 --- a/libcudacxx/include/cuda/std/atomic +++ b/libcudacxx/include/cuda/std/atomic @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_ATOMIC diff --git a/libcudacxx/include/cuda/std/barrier b/libcudacxx/include/cuda/std/barrier index 415c3f80acf..94ab6e65df4 100644 --- a/libcudacxx/include/cuda/std/barrier +++ b/libcudacxx/include/cuda/std/barrier @@ -17,10 +17,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_BARRIER diff --git a/libcudacxx/include/cuda/std/bit b/libcudacxx/include/cuda/std/bit index 491b346c576..a80f1d5d1df 100644 --- a/libcudacxx/include/cuda/std/bit +++ b/libcudacxx/include/cuda/std/bit @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_BIT diff --git a/libcudacxx/include/cuda/std/cassert b/libcudacxx/include/cuda/std/cassert index af8af80e43d..b6400ae2694 100644 --- a/libcudacxx/include/cuda/std/cassert +++ b/libcudacxx/include/cuda/std/cassert @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CASSERT diff --git a/libcudacxx/include/cuda/std/cfloat b/libcudacxx/include/cuda/std/cfloat index 31a9f8e4e61..13f64607bf3 100644 --- a/libcudacxx/include/cuda/std/cfloat +++ b/libcudacxx/include/cuda/std/cfloat @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CFLOAT diff --git a/libcudacxx/include/cuda/std/chrono b/libcudacxx/include/cuda/std/chrono index f8d62efb4f6..38eff65fb16 100644 --- a/libcudacxx/include/cuda/std/chrono +++ b/libcudacxx/include/cuda/std/chrono @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CHRONO diff --git a/libcudacxx/include/cuda/std/climits b/libcudacxx/include/cuda/std/climits index f7934b665a9..fa981537469 100644 --- a/libcudacxx/include/cuda/std/climits +++ b/libcudacxx/include/cuda/std/climits @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CLIMITS diff --git 
a/libcudacxx/include/cuda/std/cmath b/libcudacxx/include/cuda/std/cmath index a6a05ef2430..68524be4bad 100644 --- a/libcudacxx/include/cuda/std/cmath +++ b/libcudacxx/include/cuda/std/cmath @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CMATH diff --git a/libcudacxx/include/cuda/std/complex b/libcudacxx/include/cuda/std/complex index 7c8ea6b5b46..4940f7cb2bc 100644 --- a/libcudacxx/include/cuda/std/complex +++ b/libcudacxx/include/cuda/std/complex @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_COMPLEX diff --git a/libcudacxx/include/cuda/std/concepts b/libcudacxx/include/cuda/std/concepts index d3f9eb25dde..eee16d9b100 100644 --- a/libcudacxx/include/cuda/std/concepts +++ b/libcudacxx/include/cuda/std/concepts @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CONCEPTS diff --git a/libcudacxx/include/cuda/std/cstddef b/libcudacxx/include/cuda/std/cstddef index 95aae77de22..5fe32da86d8 100644 --- a/libcudacxx/include/cuda/std/cstddef +++ b/libcudacxx/include/cuda/std/cstddef @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CSTDDEF diff --git a/libcudacxx/include/cuda/std/cstdint b/libcudacxx/include/cuda/std/cstdint index 22c0754e481..f62a90d93ee 100644 --- a/libcudacxx/include/cuda/std/cstdint +++ b/libcudacxx/include/cuda/std/cstdint @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CSTDINT diff --git a/libcudacxx/include/cuda/std/cstdlib b/libcudacxx/include/cuda/std/cstdlib index af85815be27..36c3d976657 100644 --- a/libcudacxx/include/cuda/std/cstdlib +++ b/libcudacxx/include/cuda/std/cstdlib @@ -12,11 +12,8 @@ #define _CUDA_STD_CSTDLIB #include "detail/__config" - +#include "detail/__pragma_pop" #include "detail/__pragma_push" - #include "detail/libcxx/include/cstdlib" -#include "detail/__pragma_pop" - #endif // _CUDA_STD_CSTDLIB diff --git a/libcudacxx/include/cuda/std/ctime b/libcudacxx/include/cuda/std/ctime index d610c831077..72275a6bdf3 100644 --- a/libcudacxx/include/cuda/std/ctime +++ b/libcudacxx/include/cuda/std/ctime @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_CTIME diff --git a/libcudacxx/include/cuda/std/detail/__access_property b/libcudacxx/include/cuda/std/detail/__access_property index 7d9718503e9..c63ec342df9 100644 --- a/libcudacxx/include/cuda/std/detail/__access_property +++ b/libcudacxx/include/cuda/std/detail/__access_property @@ -3,325 +3,445 @@ * * NVIDIA SOFTWARE LICENSE * - * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). + * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the + * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). * - * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. 
+ * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. + * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By + * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of + * this license, and you take legal and financial responsibility for the actions of your permitted users. * - * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, + * regulation or generally accepted practices or guidelines in the relevant jurisdictions. * - * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. + * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install + * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this + * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under + * this license. * * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: - * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. 
As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. + * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. * - * 4. PRE-RELEASE. 
SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. + * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. * - * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. 
“Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. * - * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. * - * 10. TERMINATION. 
Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. + * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. All provisions of this license will survive termination, + * except for the license granted to you. * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. 
Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. 
If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. August 20, 2021) */ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -namespace __detail_ap { +namespace __detail_ap +{ - _CCCL_HOST_DEVICE - constexpr uint32_t __ap_floor_log2(uint32_t __x) { - return (__x == 1 | __x == 0) ? 0 : 1 + __ap_floor_log2(__x >> 1); - } +_CCCL_HOST_DEVICE constexpr uint32_t __ap_floor_log2(uint32_t __x) +{ + return (__x == 1 | __x == 0) ? 0 : 1 + __ap_floor_log2(__x >> 1); +} - _CCCL_HOST_DEVICE - constexpr uint32_t __ap_ceil_log2(uint32_t __x) { - return (__x == 1 | __x == 0) ? 0 : __ap_floor_log2(__x - 1) + 1; - } +_CCCL_HOST_DEVICE constexpr uint32_t __ap_ceil_log2(uint32_t __x) +{ + return (__x == 1 | __x == 0) ? 0 : __ap_floor_log2(__x - 1) + 1; +} - _CCCL_HOST_DEVICE - constexpr uint32_t __ap_min(uint32_t __a, uint32_t __b) noexcept { - return (__a < __b) ? __a : __b; - } +_CCCL_HOST_DEVICE constexpr uint32_t __ap_min(uint32_t __a, uint32_t __b) noexcept +{ + return (__a < __b) ? __a : __b; +} - _CCCL_HOST_DEVICE - constexpr uint32_t __ap_max(uint32_t __a, uint32_t __b) noexcept { - return (__a > __b) ? __a : __b; - } +_CCCL_HOST_DEVICE constexpr uint32_t __ap_max(uint32_t __a, uint32_t __b) noexcept +{ + return (__a > __b) ? __a : __b; +} // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414 // Specifically search for 8.4 and 9.3 and above to guarantee uint64_t enum. 
-#if defined(_CCCL_COMPILER_GCC) && ( \ - ((_GNUC_VER < 804)) || \ - ((_GNUC_VER < 903)) \ - ) -# define _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +#if defined(_CCCL_COMPILER_GCC) && (((_GNUC_VER < 804)) || ((_GNUC_VER < 903))) +# define _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION #else -# define _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION : uint64_t +# define _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION : uint64_t #endif - namespace __sm_80 { - namespace __off { - enum __l2_cop_off_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _L2_EVICT_NORMAL = 0, - _L2_EVICT_FIRST = 1, - }; - } // namespace __off - - namespace __on { - enum __l2_cop_on_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _L2_EVICT_NORMAL = 0, - _L2_EVICT_FIRST = 1, - _L2_EVICT_LAST = 2, - _L2_EVICT_NORMAL_DEMOTE = 3, - }; - } // namespace __on - - enum __l2_descriptor_mode_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _DESC_IMPLICIT = 0, - _DESC_INTERLEAVED = 2, - _DESC_BLOCK_TYPE = 3, - }; - - enum __l2_eviction_max_way_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _CUDA_AMPERE_MAX_L2_WAYS = std::uint32_t{16}, - }; - - enum __block_size_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION { - _BLOCKSIZE_4K = 0, - _BLOCKSIZE_8K = 1, - _BLOCKSIZE_16K = 2, - _BLOCKSIZE_32K = 3, - _BLOCKSIZE_64K = 4, - _BLOCKSIZE_128K = 5, - _BLOCKSIZE_256K = 6, - _BLOCKSIZE_512K = 7, - _BLOCKSIZE_1M = 8, - _BLOCKSIZE_2M = 9, - _BLOCKSIZE_4M = 10, - _BLOCKSIZE_8M = 11, - _BLOCKSIZE_16M = 12, - _BLOCKSIZE_32M = 13, - }; - - struct __block_desc_t { - uint64_t __ap_reserved : 37; - uint64_t __block_count: 7; - uint64_t __block_start: 7; - uint64_t __ap_reserved2 : 1; - __block_size_t __block_size : 4; - __off::__l2_cop_off_t __l2_cop_off : 1; - __on::__l2_cop_on_t __l2_cop_on : 2; - __l2_descriptor_mode_t __l2_descriptor_mode : 2; - uint64_t __l1_inv_dont_allocate : 1; - uint64_t __l2_sector_promote_256B : 1; - uint64_t __ap_reserved3 : 1; - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __get_descriptor_cexpr() const noexcept { - return - std::uint64_t(__ap_reserved) << 0 | - std::uint64_t(__block_count) << 37 | - std::uint64_t(__block_start) << 44 | - std::uint64_t(__ap_reserved2) << 51 | - std::uint64_t(__block_size) << 52 | - std::uint64_t(__l2_cop_off) << 56 | - std::uint64_t(__l2_cop_on) << 57 | - std::uint64_t(__l2_descriptor_mode) << 59 | - std::uint64_t(__l1_inv_dont_allocate) << 61 | - std::uint64_t(__l2_sector_promote_256B) << 62 | - std::uint64_t(__ap_reserved3) << 63; - } - - inline - _CCCL_HOST_DEVICE - std::uint64_t __get_descriptor_non_cexpr() const noexcept { return *reinterpret_cast(this); } - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __get_descriptor() const noexcept { +namespace __sm_80 +{ +namespace __off +{ +enum __l2_cop_off_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _L2_EVICT_NORMAL = 0, + _L2_EVICT_FIRST = 1, +}; +} // namespace __off + +namespace __on +{ +enum __l2_cop_on_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _L2_EVICT_NORMAL = 0, + _L2_EVICT_FIRST = 1, + _L2_EVICT_LAST = 2, + _L2_EVICT_NORMAL_DEMOTE = 3, +}; +} // namespace __on + +enum __l2_descriptor_mode_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _DESC_IMPLICIT = 0, + _DESC_INTERLEAVED = 2, + _DESC_BLOCK_TYPE = 3, +}; + +enum __l2_eviction_max_way_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _CUDA_AMPERE_MAX_L2_WAYS = std::uint32_t{16}, +}; + +enum __block_size_t _LIBCUDACXX_AP_ENUM_TYPE_ANNOTATION +{ + _BLOCKSIZE_4K = 0, + _BLOCKSIZE_8K = 1, + _BLOCKSIZE_16K = 2, + _BLOCKSIZE_32K = 3, + _BLOCKSIZE_64K = 4, + _BLOCKSIZE_128K = 5, + _BLOCKSIZE_256K = 6, + _BLOCKSIZE_512K = 7, + _BLOCKSIZE_1M = 8, + _BLOCKSIZE_2M = 9, + 
_BLOCKSIZE_4M = 10, + _BLOCKSIZE_8M = 11, + _BLOCKSIZE_16M = 12, + _BLOCKSIZE_32M = 13, +}; + +struct __block_desc_t +{ + uint64_t __ap_reserved : 37; + uint64_t __block_count : 7; + uint64_t __block_start : 7; + uint64_t __ap_reserved2 : 1; + __block_size_t __block_size : 4; + __off::__l2_cop_off_t __l2_cop_off : 1; + __on::__l2_cop_on_t __l2_cop_on : 2; + __l2_descriptor_mode_t __l2_descriptor_mode : 2; + uint64_t __l1_inv_dont_allocate : 1; + uint64_t __l2_sector_promote_256B : 1; + uint64_t __ap_reserved3 : 1; + + _CCCL_HOST_DEVICE constexpr std::uint64_t __get_descriptor_cexpr() const noexcept + { + return std::uint64_t(__ap_reserved) << 0 | std::uint64_t(__block_count) << 37 | std::uint64_t(__block_start) << 44 + | std::uint64_t(__ap_reserved2) << 51 | std::uint64_t(__block_size) << 52 | std::uint64_t(__l2_cop_off) << 56 + | std::uint64_t(__l2_cop_on) << 57 | std::uint64_t(__l2_descriptor_mode) << 59 + | std::uint64_t(__l1_inv_dont_allocate) << 61 | std::uint64_t(__l2_sector_promote_256B) << 62 + | std::uint64_t(__ap_reserved3) << 63; + } + + inline _CCCL_HOST_DEVICE std::uint64_t __get_descriptor_non_cexpr() const noexcept + { + return *reinterpret_cast(this); + } + + _CCCL_HOST_DEVICE constexpr std::uint64_t __get_descriptor() const noexcept + { #if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) - return cuda::std::is_constant_evaluated() ? - __get_descriptor_cexpr() : - __get_descriptor_non_cexpr(); + return cuda::std::is_constant_evaluated() ? __get_descriptor_cexpr() : __get_descriptor_non_cexpr(); #else - return __get_descriptor_cexpr(); + return __get_descriptor_cexpr(); #endif - } - }; - static_assert(sizeof(__block_desc_t) == 8, "__block_desc_t should be 8 bytes"); - static_assert(sizeof(__block_desc_t) == sizeof(std::uint64_t), ""); - static_assert( - __block_desc_t{(uint64_t)1, (uint64_t)1, (uint64_t)1, (uint64_t)1, __block_size_t::_BLOCKSIZE_8K, __off::_L2_EVICT_FIRST, __on::_L2_EVICT_FIRST, __l2_descriptor_mode_t::_DESC_INTERLEAVED, (uint64_t)1, (uint64_t)1, (uint64_t)1}.__get_descriptor() - == 0xF318102000000001, ""); - - /* Factory like struct to build a __block_desc_t due to constexpr C++11 - */ - struct __block_descriptor_builder { //variable declaration order matters == usage order - std::uint32_t __offset; - __block_size_t __block_size; - std::uint32_t __block_start, __end_hit; - std::uint32_t __block_count; - __off::__l2_cop_off_t __l2_cop_off; - __on::__l2_cop_on_t __l2_cop_on; - __l2_descriptor_mode_t __l2_descriptor_mode; - bool __l1_inv_dont_allocate, __l2_sector_promote_256B; - - _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_offset(std::size_t __total_bytes) { - return __ap_max(std::uint32_t{12}, static_cast(__ap_ceil_log2(static_cast(__total_bytes))) - std::uint32_t{7}); - } - - _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_block_start(std::uintptr_t __ptr, std::size_t __total_bytes) { - return static_cast(__ptr >> __calc_offset(static_cast(__total_bytes))); - } - - _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_end_hit(std::uintptr_t __ptr, std::size_t __hit_bytes, std::size_t __total_bytes) { - return static_cast((__ptr + __hit_bytes + (std::uintptr_t{1} << (__calc_offset(static_cast(__total_bytes)))) - 1) >> __calc_offset(static_cast(__total_bytes))); - } - - _CCCL_HOST_DEVICE constexpr __block_descriptor_builder(std::uintptr_t __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, __on::__l2_cop_on_t __hit_prop, __off::__l2_cop_off_t __miss_prop) - : __offset(__calc_offset(__total_bytes)) - , 
__block_size(static_cast<__block_size_t>(__calc_offset(__total_bytes) - std::uint32_t{12})) - , __block_start(__calc_block_start(__ptr, __total_bytes)) - , __end_hit(__calc_end_hit(__ptr, __hit_bytes, __total_bytes)) - , __block_count(__calc_end_hit(__ptr, __hit_bytes, __total_bytes) - __calc_block_start(__ptr, __total_bytes)) - , __l2_cop_off(__miss_prop) - , __l2_cop_on(__hit_prop) - , __l2_descriptor_mode(_DESC_BLOCK_TYPE) - , __l1_inv_dont_allocate(false) - , __l2_sector_promote_256B(false) - {} - - _CCCL_HOST_DEVICE - constexpr __block_desc_t __get_block() const noexcept { - return __block_desc_t { 0, __ap_min(std::uint32_t{0x7f}, __block_count), (__block_start & std::uint32_t{0x7f}), 0, __block_size, __l2_cop_off, __l2_cop_on, _DESC_BLOCK_TYPE, false, false, 0 }; - } - }; - static_assert(sizeof(std::uintptr_t) > 4, "std::uintptr_t needs at least 5 bytes for this code to work"); - - struct __interleave_descriptor_t { - uint64_t __ap_reserved : 52; - uint64_t __fraction : 4; - __off::__l2_cop_off_t __l2_cop_off : 1; - __on::__l2_cop_on_t __l2_cop_on : 2; - __l2_descriptor_mode_t __l2_descriptor_mode : 2; - uint64_t __l1_inv_dont_allocate : 1; - uint64_t __l2_sector_promote_256B : 1; - uint64_t __ap_reserved2 : 1; - - _CCCL_HOST_DEVICE - constexpr __interleave_descriptor_t( - __on::__l2_cop_on_t __hit_prop, - std::uint32_t __hit_ratio, - __off::__l2_cop_off_t __miss_prop) noexcept - : __ap_reserved(0x0), - __fraction(__hit_ratio), - __l2_cop_off(__miss_prop), - __l2_cop_on(__hit_prop), - __l2_descriptor_mode(_DESC_INTERLEAVED), - __l1_inv_dont_allocate(0x0), - __l2_sector_promote_256B(0x0), - __ap_reserved2(0x0) {} - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __get_descriptor_cexpr() const { - return - std::uint64_t(__ap_reserved) << 0 | - std::uint64_t(__fraction) << 52 | - std::uint64_t(__l2_cop_off) << 56 | - std::uint64_t(__l2_cop_on) << 57 | - std::uint64_t(__l2_descriptor_mode) << 59 | - std::uint64_t(__l1_inv_dont_allocate) << 61 | - std::uint64_t(__l2_sector_promote_256B) << 62 | - std::uint64_t(__ap_reserved2) << 63; - } - - inline - _CCCL_HOST_DEVICE - std::uint64_t __get_descriptor_non_cexpr() const noexcept { return *reinterpret_cast(this); } - - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __get_descriptor() const noexcept { + } +}; +static_assert(sizeof(__block_desc_t) == 8, "__block_desc_t should be 8 bytes"); +static_assert(sizeof(__block_desc_t) == sizeof(std::uint64_t), ""); +static_assert( + __block_desc_t{ + (uint64_t) 1, + (uint64_t) 1, + (uint64_t) 1, + (uint64_t) 1, + __block_size_t::_BLOCKSIZE_8K, + __off::_L2_EVICT_FIRST, + __on::_L2_EVICT_FIRST, + __l2_descriptor_mode_t::_DESC_INTERLEAVED, + (uint64_t) 1, + (uint64_t) 1, + (uint64_t) 1} + .__get_descriptor() + == 0xF318102000000001, + ""); + +/* Factory like struct to build a __block_desc_t due to constexpr C++11 + */ +struct __block_descriptor_builder +{ // variable declaration order matters == usage order + std::uint32_t __offset; + __block_size_t __block_size; + std::uint32_t __block_start, __end_hit; + std::uint32_t __block_count; + __off::__l2_cop_off_t __l2_cop_off; + __on::__l2_cop_on_t __l2_cop_on; + __l2_descriptor_mode_t __l2_descriptor_mode; + bool __l1_inv_dont_allocate, __l2_sector_promote_256B; + + _CCCL_HOST_DEVICE static constexpr std::uint32_t __calc_offset(std::size_t __total_bytes) + { + return __ap_max( + std::uint32_t{12}, + static_cast(__ap_ceil_log2(static_cast(__total_bytes))) - std::uint32_t{7}); + } + + _CCCL_HOST_DEVICE static constexpr std::uint32_t 
__calc_block_start(std::uintptr_t __ptr, std::size_t __total_bytes) + { + return static_cast(__ptr >> __calc_offset(static_cast(__total_bytes))); + } + + _CCCL_HOST_DEVICE static constexpr std::uint32_t + __calc_end_hit(std::uintptr_t __ptr, std::size_t __hit_bytes, std::size_t __total_bytes) + { + return static_cast( + (__ptr + __hit_bytes + (std::uintptr_t{1} << (__calc_offset(static_cast(__total_bytes)))) - 1) + >> __calc_offset(static_cast(__total_bytes))); + } + + _CCCL_HOST_DEVICE constexpr __block_descriptor_builder( + std::uintptr_t __ptr, + std::size_t __hit_bytes, + std::size_t __total_bytes, + __on::__l2_cop_on_t __hit_prop, + __off::__l2_cop_off_t __miss_prop) + : __offset(__calc_offset(__total_bytes)) + , __block_size(static_cast<__block_size_t>(__calc_offset(__total_bytes) - std::uint32_t{12})) + , __block_start(__calc_block_start(__ptr, __total_bytes)) + , __end_hit(__calc_end_hit(__ptr, __hit_bytes, __total_bytes)) + , __block_count(__calc_end_hit(__ptr, __hit_bytes, __total_bytes) - __calc_block_start(__ptr, __total_bytes)) + , __l2_cop_off(__miss_prop) + , __l2_cop_on(__hit_prop) + , __l2_descriptor_mode(_DESC_BLOCK_TYPE) + , __l1_inv_dont_allocate(false) + , __l2_sector_promote_256B(false) + {} + + _CCCL_HOST_DEVICE constexpr __block_desc_t __get_block() const noexcept + { + return __block_desc_t{ + 0, + __ap_min(std::uint32_t{0x7f}, __block_count), + (__block_start & std::uint32_t{0x7f}), + 0, + __block_size, + __l2_cop_off, + __l2_cop_on, + _DESC_BLOCK_TYPE, + false, + false, + 0}; + } +}; +static_assert(sizeof(std::uintptr_t) > 4, "std::uintptr_t needs at least 5 bytes for this code to work"); + +struct __interleave_descriptor_t +{ + uint64_t __ap_reserved : 52; + uint64_t __fraction : 4; + __off::__l2_cop_off_t __l2_cop_off : 1; + __on::__l2_cop_on_t __l2_cop_on : 2; + __l2_descriptor_mode_t __l2_descriptor_mode : 2; + uint64_t __l1_inv_dont_allocate : 1; + uint64_t __l2_sector_promote_256B : 1; + uint64_t __ap_reserved2 : 1; + + _CCCL_HOST_DEVICE constexpr __interleave_descriptor_t( + __on::__l2_cop_on_t __hit_prop, std::uint32_t __hit_ratio, __off::__l2_cop_off_t __miss_prop) noexcept + : __ap_reserved(0x0) + , __fraction(__hit_ratio) + , __l2_cop_off(__miss_prop) + , __l2_cop_on(__hit_prop) + , __l2_descriptor_mode(_DESC_INTERLEAVED) + , __l1_inv_dont_allocate(0x0) + , __l2_sector_promote_256B(0x0) + , __ap_reserved2(0x0) + {} + + _CCCL_HOST_DEVICE constexpr std::uint64_t __get_descriptor_cexpr() const + { + return std::uint64_t(__ap_reserved) << 0 | std::uint64_t(__fraction) << 52 | std::uint64_t(__l2_cop_off) << 56 + | std::uint64_t(__l2_cop_on) << 57 | std::uint64_t(__l2_descriptor_mode) << 59 + | std::uint64_t(__l1_inv_dont_allocate) << 61 | std::uint64_t(__l2_sector_promote_256B) << 62 + | std::uint64_t(__ap_reserved2) << 63; + } + + inline _CCCL_HOST_DEVICE std::uint64_t __get_descriptor_non_cexpr() const noexcept + { + return *reinterpret_cast(this); + } + + _CCCL_HOST_DEVICE constexpr std::uint64_t __get_descriptor() const noexcept + { #if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) - return cuda::std::is_constant_evaluated() ? - __get_descriptor_cexpr() : - __get_descriptor_non_cexpr(); + return cuda::std::is_constant_evaluated() ? 
__get_descriptor_cexpr() : __get_descriptor_non_cexpr(); #else - return __get_descriptor_cexpr(); + return __get_descriptor_cexpr(); #endif - } - }; - static_assert(sizeof(__interleave_descriptor_t) == 8, "__interleave_descriptor_t should be 8 bytes"); - static_assert(sizeof(__interleave_descriptor_t) == sizeof(std::uint64_t), ""); - - _CCCL_HOST_DEVICE - static constexpr std::uint64_t __interleave_normal() noexcept { - return 0x10F0000000000000; - } - - _CCCL_HOST_DEVICE - static constexpr std::uint64_t __interleave_streaming() noexcept { - return 0x12F0000000000000; - } - - _CCCL_HOST_DEVICE - static constexpr std::uint64_t __interleave_persisting() noexcept { - return 0x14F0000000000000; - } - - _CCCL_HOST_DEVICE - static constexpr std::uint64_t __interleave_normal_demote() noexcept { - return 0x16F0000000000000; - } - - } // namespace __sm_80 - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __interleave(cudaAccessProperty __hit_prop, float __hit_ratio, cudaAccessProperty __miss_prop = cudaAccessPropertyNormal) { - return __sm_80::__interleave_descriptor_t( - ((__hit_prop == cudaAccessPropertyNormal) ? __sm_80::__on::__l2_cop_on_t::_L2_EVICT_NORMAL_DEMOTE : static_cast<__sm_80::__on::__l2_cop_on_t>(__hit_prop)), - __ap_min((static_cast(__hit_ratio) * __sm_80::__l2_eviction_max_way_t::_CUDA_AMPERE_MAX_L2_WAYS), static_cast(__sm_80::__l2_eviction_max_way_t::_CUDA_AMPERE_MAX_L2_WAYS - 1)), - static_cast<__sm_80::__off::__l2_cop_off_t>(__miss_prop) - ).__get_descriptor(); - } - - _CCCL_HOST_DEVICE - constexpr std::uint64_t __block(void* __ptr, std::size_t __hit_bytes, std::size_t __total_bytes, cudaAccessProperty __hit_prop, cudaAccessProperty __miss_prop = cudaAccessPropertyNormal) { - return (__total_bytes <= (size_t{0xFFFFFFFF}) & __total_bytes != 0 & __hit_bytes <= __total_bytes) ? __sm_80::__block_descriptor_builder( - reinterpret_cast(__ptr), - __hit_bytes, - __total_bytes, - (__hit_prop == cudaAccessPropertyNormal) ? __sm_80::__on::_L2_EVICT_NORMAL_DEMOTE : static_cast<__sm_80::__on::__l2_cop_on_t>(__hit_prop), - static_cast<__sm_80::__off::__l2_cop_off_t>(__miss_prop) - ).__get_block().__get_descriptor() - : __sm_80::__interleave_normal(); } +}; +static_assert(sizeof(__interleave_descriptor_t) == 8, "__interleave_descriptor_t should be 8 bytes"); +static_assert(sizeof(__interleave_descriptor_t) == sizeof(std::uint64_t), ""); + +_CCCL_HOST_DEVICE static constexpr std::uint64_t __interleave_normal() noexcept +{ + return 0x10F0000000000000; +} + +_CCCL_HOST_DEVICE static constexpr std::uint64_t __interleave_streaming() noexcept +{ + return 0x12F0000000000000; +} + +_CCCL_HOST_DEVICE static constexpr std::uint64_t __interleave_persisting() noexcept +{ + return 0x14F0000000000000; +} + +_CCCL_HOST_DEVICE static constexpr std::uint64_t __interleave_normal_demote() noexcept +{ + return 0x16F0000000000000; +} + +} // namespace __sm_80 + +_CCCL_HOST_DEVICE constexpr std::uint64_t __interleave( + cudaAccessProperty __hit_prop, float __hit_ratio, cudaAccessProperty __miss_prop = cudaAccessPropertyNormal) +{ + return __sm_80::__interleave_descriptor_t( + ((__hit_prop == cudaAccessPropertyNormal) ? 
__sm_80::__on::__l2_cop_on_t::_L2_EVICT_NORMAL_DEMOTE + : static_cast<__sm_80::__on::__l2_cop_on_t>(__hit_prop)), + __ap_min( + (static_cast(__hit_ratio) * __sm_80::__l2_eviction_max_way_t::_CUDA_AMPERE_MAX_L2_WAYS), + static_cast(__sm_80::__l2_eviction_max_way_t::_CUDA_AMPERE_MAX_L2_WAYS - 1)), + static_cast<__sm_80::__off::__l2_cop_off_t>(__miss_prop)) + .__get_descriptor(); +} + +_CCCL_HOST_DEVICE constexpr std::uint64_t __block( + void* __ptr, + std::size_t __hit_bytes, + std::size_t __total_bytes, + cudaAccessProperty __hit_prop, + cudaAccessProperty __miss_prop = cudaAccessPropertyNormal) +{ + return (__total_bytes <= (size_t{0xFFFFFFFF}) & __total_bytes != 0 & __hit_bytes <= __total_bytes) + ? __sm_80::__block_descriptor_builder( + reinterpret_cast(__ptr), + __hit_bytes, + __total_bytes, + (__hit_prop == cudaAccessPropertyNormal) + ? __sm_80::__on::_L2_EVICT_NORMAL_DEMOTE + : static_cast<__sm_80::__on::__l2_cop_on_t>(__hit_prop), + static_cast<__sm_80::__off::__l2_cop_off_t>(__miss_prop)) + .__get_block() + .__get_descriptor() + : __sm_80::__interleave_normal(); +} } // namespace __detail_ap _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/detail/__annotated_ptr b/libcudacxx/include/cuda/std/detail/__annotated_ptr index f1d4b166b6e..eb84a309f45 100644 --- a/libcudacxx/include/cuda/std/detail/__annotated_ptr +++ b/libcudacxx/include/cuda/std/detail/__annotated_ptr @@ -3,229 +3,327 @@ * * NVIDIA SOFTWARE LICENSE * - * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). + * This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the + * NVIDIA/CUDA C++ Library software and materials provided hereunder (“SOFTWARE”). * - * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. + * This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. + * If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By + * taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of + * this license, and you take legal and financial responsibility for the actions of your permitted users. * - * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. + * You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, + * regulation or generally accepted practices or guidelines in the relevant jurisdictions. * - * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. 
NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. + * 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install + * and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this + * license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under + * this license. * * 2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: - * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. - * b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. + * a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, + * including (without limitation) terms relating to the license grant and license restrictions and protection of + * NVIDIA’s intellectual property rights. b. You agree to notify NVIDIA in writing of any known or suspected + * distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms + * of your agreements with respect to distributed SOFTWARE. * * 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: * a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. - * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. - * c. You may not modify or create derivative works of any portion of the SOFTWARE. - * d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. - * e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. - * f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. - * g. 
You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. + * b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from + * any portion of the SOFTWARE or copies of the SOFTWARE. c. You may not modify or create derivative works of any + * portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, + * security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE + * in any manner that would cause it to become subject to an open source software license. As examples, licenses that + * require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in + * source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. f. + * Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or + * application where the use or failure of the system or application can reasonably be expected to threaten or result in + * personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life + * support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these + * critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or + * damages arising from such uses. g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, + * and their respective employees, contractors, agents, officers and directors, from and against any and all claims, + * damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited + * to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to use + * of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. * - * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. + * 4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may + * not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, + * availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use + * a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in + * production or business-critical systems. * - * 5. OWNERSHIP. 
The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. + * 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and + * exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United + * States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time + * without notice, but is not obligated to support or update the SOFTWARE. * - * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + * 6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal + * notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is + * a conflict between the terms in this license and the license terms associated with a component, the license terms + * associated with the components control only to the extent necessary to resolve the conflict. * - * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. + * 7. FEEDBACK. You may, but don’t have to, provide to NVIDIA any Feedback. “Feedback” means any suggestions, bug fixes, + * enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you + * voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable + * license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute + * (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA + * will use Feedback at its choice. * - * 8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + * 8. NO WARRANTIES. 
THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT + * NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT + * WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR + * ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. * - * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. + * 9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE + * FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, + * LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH + * THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON + * BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION + * OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE + * POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING + * OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE + * OR EXTEND THIS LIMIT. * - * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. + * 10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail + * to comply with any term and condition of this license or if you commence or participate in any legal proceeding + * against NVIDIA with respect to the SOFTWARE. 
NVIDIA may terminate this license with advance written notice to you if + * NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA’s sole discretion, the continued use of + * it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of + * the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this + * license are not affected by the termination of this license. All provisions of this license will survive termination, + * except for the license granted to you. * - * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + * 11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State + * of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware + * residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the + * International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English + * language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction + * over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be + * allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. * - * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + * 12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or + * operation of law without NVIDIA’s permission. Any attempted assignment not approved by NVIDIA in writing shall be + * void and of no effect. * - * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. + * 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. 
You agree that you will not ship, + * transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States + * Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s + * Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws + * include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not + * a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from + * receiving the SOFTWARE. * - * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. + * 14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is “commercial items” consisting + * of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. + * Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the + * restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the + * Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is + * NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. * - * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. + * 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the + * subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to + * this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of + * this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. + * This license may only be modified in a writing signed by an authorized representative of each party. * * (v. 
August 20, 2021) */ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA -namespace __detail_ap { +namespace __detail_ap +{ - template - _CCCL_DEVICE - void* __associate_address_space(void* __ptr, _Property __prop) { - if (std::is_same<_Property, access_property::shared>::value == true) { - bool __b = __isShared(__ptr); - _LIBCUDACXX_ASSERT(__b, ""); +template +_CCCL_DEVICE void* __associate_address_space(void* __ptr, _Property __prop) +{ + if (std::is_same<_Property, access_property::shared>::value == true) + { + bool __b = __isShared(__ptr); + _LIBCUDACXX_ASSERT(__b, ""); #if !defined(_CCCL_CUDACC_BELOW_11_2) - __builtin_assume(__b); + __builtin_assume(__b); #else // ^^^ !_CCCL_CUDACC_BELOW_11_2 ^^^ / vvv _CCCL_CUDACC_BELOW_11_2 vvv - (void)__b; + (void) __b; #endif // _CCCL_CUDACC_BELOW_11_2 - } else if (std::is_same<_Property, access_property::global>::value == true || - std::is_same<_Property, access_property::normal>::value == true || - std::is_same<_Property, access_property::persisting>::value == true || - std::is_same<_Property, access_property::streaming>::value == true || - std::is_same<_Property, access_property>::value) { - bool __b = __isGlobal(__ptr); - _LIBCUDACXX_ASSERT(__b, ""); + } + else if (std::is_same<_Property, access_property::global>::value == true + || std::is_same<_Property, access_property::normal>::value == true + || std::is_same<_Property, access_property::persisting>::value == true + || std::is_same<_Property, access_property::streaming>::value == true + || std::is_same<_Property, access_property>::value) + { + bool __b = __isGlobal(__ptr); + _LIBCUDACXX_ASSERT(__b, ""); #if !defined(_CCCL_CUDACC_BELOW_11_2) - __builtin_assume(__b); + __builtin_assume(__b); #else // ^^^ !_CCCL_CUDACC_BELOW_11_2 ^^^ / vvv _CCCL_CUDACC_BELOW_11_2 vvv - (void)__b; + (void) __b; #endif // _CCCL_CUDACC_BELOW_11_2 - } + } + + return __ptr; +} + +template +_CCCL_DEVICE void* __associate_descriptor(void* __ptr, __Prop __prop) +{ + return __associate_descriptor(__ptr, static_cast(access_property(__prop))); +} + +template <> +inline _CCCL_DEVICE void* __associate_descriptor(void* __ptr, std::uint64_t __prop) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (return __nv_associate_access_property(__ptr, __prop);), (return __ptr;)) +} - return __ptr; +template <> +inline _CCCL_DEVICE void* __associate_descriptor(void* __ptr, access_property::shared) +{ + return __ptr; +} + +template +_CCCL_HOST_DEVICE _Type* __associate(_Type* __ptr, _Property __prop) +{ + NV_IF_ELSE_TARGET(NV_IS_DEVICE, + (return static_cast<_Type*>(__associate_descriptor( + __associate_address_space(const_cast(static_cast(__ptr)), __prop), __prop));), + (return __ptr;)) +} + +template +class __annotated_ptr_base +{ + using __error = typename _Property::__unknown_access_property_type; +}; + +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = 0; + + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::shared) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::shared{}); + } + _CCCL_HOST_DEVICE constexpr access_property::shared __get_property() const noexcept + { + return access_property::shared{}; } +}; - template - _CCCL_DEVICE - void* __associate_descriptor(void* __ptr, __Prop __prop) { - return 
__associate_descriptor(__ptr, static_cast(access_property(__prop))); +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = __sm_80::__interleave_normal(); + + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::global) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::global{}); + } + _CCCL_HOST_DEVICE constexpr access_property::global __get_property() const noexcept + { + return access_property::global{}; } +}; + +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = __sm_80::__interleave_normal_demote(); - template <> - inline _CCCL_DEVICE - void* __associate_descriptor(void* __ptr, std::uint64_t __prop) { - NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( - return __nv_associate_access_property(__ptr, __prop); - ),( - return __ptr; - )) + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::normal) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::normal{}); } + _CCCL_HOST_DEVICE constexpr access_property::normal __get_property() const noexcept + { + return access_property::normal{}; + } +}; + +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = __sm_80::__interleave_persisting(); - template<> - inline _CCCL_DEVICE - void* __associate_descriptor(void* __ptr, access_property::shared) { - return __ptr; + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::persisting) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::persisting{}); } + _CCCL_HOST_DEVICE constexpr access_property::persisting __get_property() const noexcept + { + return access_property::persisting{}; + } +}; + +template <> +class __annotated_ptr_base +{ +protected: + static constexpr std::uint64_t __prop = __sm_80::__interleave_streaming(); - template - _CCCL_HOST_DEVICE - _Type* __associate(_Type* __ptr, _Property __prop) { - NV_IF_ELSE_TARGET(NV_IS_DEVICE,( - return static_cast<_Type*>(__associate_descriptor( - __associate_address_space(const_cast(static_cast(__ptr)), __prop), - __prop)); - ),( - return __ptr; - )) + constexpr __annotated_ptr_base() noexcept = default; + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::streaming) noexcept {} + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, access_property::streaming{}); + } + _CCCL_HOST_DEVICE constexpr access_property::streaming __get_property() const noexcept + { + return access_property::streaming{}; } +}; +template <> +class 
__annotated_ptr_base +{ +protected: + std::uint64_t __prop; - template - class __annotated_ptr_base { - using __error = typename _Property::__unknown_access_property_type; - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = 0; - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::shared) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::shared{}); - } - _CCCL_HOST_DEVICE constexpr access_property::shared __get_property() const noexcept { - return access_property::shared{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = __sm_80::__interleave_normal(); - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::global) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::global{}); - } - _CCCL_HOST_DEVICE constexpr access_property::global __get_property() const noexcept { - return access_property::global{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = __sm_80::__interleave_normal_demote(); - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::normal) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::normal{}); - } - _CCCL_HOST_DEVICE constexpr access_property::normal __get_property() const noexcept { - return access_property::normal{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = __sm_80::__interleave_persisting(); - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::persisting) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::persisting{}); - } - _CCCL_HOST_DEVICE constexpr access_property::persisting __get_property() const noexcept { - return access_property::persisting{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - static constexpr std::uint64_t __prop = __sm_80::__interleave_streaming(); - - constexpr __annotated_ptr_base() noexcept = default; - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property::streaming) noexcept {} - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, access_property::streaming{}); - } - 
_CCCL_HOST_DEVICE constexpr access_property::streaming __get_property() const noexcept { - return access_property::streaming{}; - } - }; - - template<> - class __annotated_ptr_base { - protected: - std::uint64_t __prop; - - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base() noexcept : __prop(access_property()) {} - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(std::uint64_t __property) noexcept : __prop(__property) {} - _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property __property) noexcept - : __annotated_ptr_base(static_cast(__property)) {} - constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; - _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; - inline _CCCL_DEVICE void* __apply_prop(void* __p) const { - return __associate(__p, __prop); - } - _CCCL_HOST_DEVICE access_property __get_property() const noexcept { - return reinterpret_cast(const_cast(__prop)); - } - }; + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base() noexcept + : __prop(access_property()) + {} + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(std::uint64_t __property) noexcept + : __prop(__property) + {} + _CCCL_HOST_DEVICE constexpr __annotated_ptr_base(access_property __property) noexcept + : __annotated_ptr_base(static_cast(__property)) + {} + constexpr __annotated_ptr_base(__annotated_ptr_base const&) = default; + _CCCL_CONSTEXPR_CXX14 __annotated_ptr_base& operator=(const __annotated_ptr_base&) = default; + inline _CCCL_DEVICE void* __apply_prop(void* __p) const + { + return __associate(__p, __prop); + } + _CCCL_HOST_DEVICE access_property __get_property() const noexcept + { + return reinterpret_cast(const_cast(__prop)); + } +}; } // namespace __detail_ap _LIBCUDACXX_END_NAMESPACE_CUDA diff --git a/libcudacxx/include/cuda/std/detail/__config b/libcudacxx/include/cuda/std/detail/__config index f4fba1f24d6..aaa22d7cf6d 100644 --- a/libcudacxx/include/cuda/std/detail/__config +++ b/libcudacxx/include/cuda/std/detail/__config @@ -13,7 +13,7 @@ #include -#define _LIBCUDACXX_CUDA_API_VERSION CCCL_VERSION +#define _LIBCUDACXX_CUDA_API_VERSION CCCL_VERSION #define _LIBCUDACXX_CUDA_API_VERSION_MAJOR CCCL_MAJOR_VERSION #define _LIBCUDACXX_CUDA_API_VERSION_MINOR CCCL_MINOR_VERSION #define _LIBCUDACXX_CUDA_API_VERSION_PATCH CCCL_PATCH_VERSION diff --git a/libcudacxx/include/cuda/std/detail/__pragma_push b/libcudacxx/include/cuda/std/detail/__pragma_push index 5042010790d..d7decfb316a 100644 --- a/libcudacxx/include/cuda/std/detail/__pragma_push +++ b/libcudacxx/include/cuda/std/detail/__pragma_push @@ -8,5 +8,6 @@ // //===----------------------------------------------------------------------===// -#include #include + +#include diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__assert b/libcudacxx/include/cuda/std/detail/libcxx/include/__assert index ad54f46dfd6..3568b3b746f 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__assert +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__assert @@ -27,28 +27,28 @@ // assertions through the Debug mode previously. // TODO: In LLVM 16, make it an error to define _LIBCUDACXX_DEBUG #if defined(_LIBCUDACXX_DEBUG) -# ifndef _LIBCUDACXX_ENABLE_ASSERTIONS -# define _LIBCUDACXX_ENABLE_ASSERTIONS 1 -# endif +# ifndef _LIBCUDACXX_ENABLE_ASSERTIONS +# define _LIBCUDACXX_ENABLE_ASSERTIONS 1 +# endif #endif // Automatically enable assertions when the debug mode is enabled. 
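Not part of the patch: a standalone check of the interleave-descriptor bit layout that the __annotated_ptr_base specializations above bake in through __sm_80::__interleave_normal() and friends. It assumes only the field shifts already visible in __interleave_descriptor_t::__get_descriptor_cexpr earlier in this diff (hit fraction at bit 52, miss policy at bit 56, hit policy at bit 57, descriptor mode at bit 59); the helper name is invented for illustration.

#include <cstdint>

// Reassemble an interleaved L2 descriptor from its fields, following the
// shifts used by __interleave_descriptor_t::__get_descriptor_cexpr.
constexpr std::uint64_t interleave_descriptor(
  std::uint64_t fraction, // 4-bit hit fraction; 0xF is what __interleave produces for a hit ratio near 1.0
  std::uint64_t cop_off,  // miss policy: 0 = evict-normal, 1 = evict-first
  std::uint64_t cop_on,   // hit policy: 0 = normal, 1 = first, 2 = last, 3 = normal-demote
  std::uint64_t mode)     // 2 = _DESC_INTERLEAVED
{
  return fraction << 52 | cop_off << 56 | cop_on << 57 | mode << 59;
}

// The four canned constants defined above fall out of this layout directly.
static_assert(interleave_descriptor(0xF, 0, 0, 2) == 0x10F0000000000000, "__interleave_normal");
static_assert(interleave_descriptor(0xF, 0, 1, 2) == 0x12F0000000000000, "__interleave_streaming");
static_assert(interleave_descriptor(0xF, 0, 2, 2) == 0x14F0000000000000, "__interleave_persisting");
static_assert(interleave_descriptor(0xF, 0, 3, 2) == 0x16F0000000000000, "__interleave_normal_demote");

int main()
{
  return 0;
}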
#if defined(_LIBCUDACXX_ENABLE_DEBUG_MODE) -# ifndef _LIBCUDACXX_ENABLE_ASSERTIONS -# define _LIBCUDACXX_ENABLE_ASSERTIONS 1 -# endif +# ifndef _LIBCUDACXX_ENABLE_ASSERTIONS +# define _LIBCUDACXX_ENABLE_ASSERTIONS 1 +# endif #endif #ifndef _LIBCUDACXX_ENABLE_ASSERTIONS -# define _LIBCUDACXX_ENABLE_ASSERTIONS _LIBCUDACXX_ENABLE_ASSERTIONS_DEFAULT +# define _LIBCUDACXX_ENABLE_ASSERTIONS _LIBCUDACXX_ENABLE_ASSERTIONS_DEFAULT #endif #if _LIBCUDACXX_ENABLE_ASSERTIONS != 0 && _LIBCUDACXX_ENABLE_ASSERTIONS != 1 -# error "_LIBCUDACXX_ENABLE_ASSERTIONS must be set to 0 or 1" +# error "_LIBCUDACXX_ENABLE_ASSERTIONS must be set to 0 or 1" #endif #if _LIBCUDACXX_ENABLE_ASSERTIONS -# define _LIBCUDACXX_ASSERT(expression, message) \ +# define _LIBCUDACXX_ASSERT(expression, message) \ (_CCCL_DIAG_PUSH \ _CCCL_DIAG_SUPPRESS_CLANG("-Wassume") \ __builtin_expect(static_cast(expression), 1) ? \ @@ -56,13 +56,11 @@ ::_CUDA_VSTD::__libcpp_verbose_abort("%s:%d: assertion %s failed: %s", __FILE__, __LINE__, #expression, message) _CCCL_DIAG_POP) #elif 0 // !defined(_LIBCUDACXX_ASSERTIONS_DISABLE_ASSUME) && __has_builtin(__builtin_assume) -# define _LIBCUDACXX_ASSERT(expression, message) \ - (_CCCL_DIAG_PUSH \ - _CCCL_DIAG_SUPPRESS_CLANG("-Wassume") \ - __builtin_assume(static_cast(expression)) \ - _CCCL_DIAG_POP) +# define _LIBCUDACXX_ASSERT(expression, message) \ + (_CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_CLANG("-Wassume") __builtin_assume(static_cast(expression)) \ + _CCCL_DIAG_POP) #else -# define _LIBCUDACXX_ASSERT(expression, message) ((void)0) +# define _LIBCUDACXX_ASSERT(expression, message) ((void) 0) #endif #endif // _LIBCUDACXX___ASSERT diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__availability b/libcudacxx/include/cuda/std/detail/libcxx/include/__availability index 37ac58934ea..f89d2abf1a0 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__availability +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__availability @@ -63,226 +63,230 @@ // // [1]: https://clang.llvm.org/docs/AttributeReference.html#availability - // For backwards compatibility, allow users to define _LIBCUDACXX_DISABLE_AVAILABILITY // for a while. #if defined(_LIBCUDACXX_DISABLE_AVAILABILITY) -# if !defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) -# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS -# endif +# if !defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# endif #endif // Availability markup is disabled when building the library, or when the compiler // doesn't support the proper attributes. 
-#if defined(_LIBCUDACXX_BUILDING_LIBRARY) || \ - defined(_LIBCXXABI_BUILDING_LIBRARY) || \ - !__has_feature(attribute_availability_with_strict) || \ - !__has_feature(attribute_availability_in_templates) || \ - !__has_extension(pragma_clang_attribute_external_declaration) -# if !defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) -# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS -# endif +#if defined(_LIBCUDACXX_BUILDING_LIBRARY) || defined(_LIBCXXABI_BUILDING_LIBRARY) \ + || !__has_feature(attribute_availability_with_strict) || !__has_feature(attribute_availability_in_templates) \ + || !__has_extension(pragma_clang_attribute_external_declaration) +# if !defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) +# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# endif #endif #if defined(_LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) - // This controls the availability of std::shared_mutex and std::shared_timed_mutex, - // which were added to the dylib later. -# define _LIBCUDACXX_AVAILABILITY_SHARED_MUTEX +// This controls the availability of std::shared_mutex and std::shared_timed_mutex, +// which were added to the dylib later. +# define _LIBCUDACXX_AVAILABILITY_SHARED_MUTEX // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex - // These macros control the availability of std::bad_optional_access and - // other exception types. These were put in the shared library to prevent - // code bloat from every user program defining the vtable for these exception - // types. - // - // Note that when exceptions are disabled, the methods that normally throw - // these exceptions can be used even on older deployment targets, but those - // methods will abort instead of throwing. -# define _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS -# define _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST - - // This controls the availability of std::uncaught_exceptions(). -# define _LIBCUDACXX_AVAILABILITY_UNCAUGHT_EXCEPTIONS - - // This controls the availability of the sized version of ::operator delete, - // ::operator delete[], and their align_val_t variants, which were all added - // in C++17, and hence not present in early dylibs. -# define _LIBCUDACXX_AVAILABILITY_SIZED_NEW_DELETE - - // This controls the availability of the std::future_error exception. - // - // Note that when exceptions are disabled, the methods that normally throw - // std::future_error can be used even on older deployment targets, but those - // methods will abort instead of throwing. -# define _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR - - // This controls the availability of std::type_info's vtable. - // I can't imagine how using std::type_info can work at all if - // this isn't supported. -# define _LIBCUDACXX_AVAILABILITY_TYPEINFO_VTABLE - - // This controls the availability of std::locale::category members - // (e.g. std::locale::collate), which are defined in the dylib. -# define _LIBCUDACXX_AVAILABILITY_LOCALE_CATEGORY - - // This controls the availability of atomic operations on std::shared_ptr - // (e.g. `std::atomic_store(std::shared_ptr)`), which require a shared - // lock table located in the dylib. -# define _LIBCUDACXX_AVAILABILITY_ATOMIC_SHARED_PTR - - // These macros control the availability of all parts of that - // depend on something in the dylib. 
-# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_PUSH -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_POP +// These macros control the availability of std::bad_optional_access and +// other exception types. These were put in the shared library to prevent +// code bloat from every user program defining the vtable for these exception +// types. +// +// Note that when exceptions are disabled, the methods that normally throw +// these exceptions can be used even on older deployment targets, but those +// methods will abort instead of throwing. +# define _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS +# define _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST + +// This controls the availability of std::uncaught_exceptions(). +# define _LIBCUDACXX_AVAILABILITY_UNCAUGHT_EXCEPTIONS + +// This controls the availability of the sized version of ::operator delete, +// ::operator delete[], and their align_val_t variants, which were all added +// in C++17, and hence not present in early dylibs. +# define _LIBCUDACXX_AVAILABILITY_SIZED_NEW_DELETE + +// This controls the availability of the std::future_error exception. +// +// Note that when exceptions are disabled, the methods that normally throw +// std::future_error can be used even on older deployment targets, but those +// methods will abort instead of throwing. +# define _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR + +// This controls the availability of std::type_info's vtable. +// I can't imagine how using std::type_info can work at all if +// this isn't supported. +# define _LIBCUDACXX_AVAILABILITY_TYPEINFO_VTABLE + +// This controls the availability of std::locale::category members +// (e.g. std::locale::collate), which are defined in the dylib. +# define _LIBCUDACXX_AVAILABILITY_LOCALE_CATEGORY + +// This controls the availability of atomic operations on std::shared_ptr +// (e.g. `std::atomic_store(std::shared_ptr)`), which require a shared +// lock table located in the dylib. +# define _LIBCUDACXX_AVAILABILITY_ATOMIC_SHARED_PTR + +// These macros control the availability of all parts of <filesystem> that +// depend on something in the dylib. +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_PUSH +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_POP // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem - // This controls the availability of floating-point std::to_chars functions. - // These overloads were added later than the integer overloads. -# define _LIBCUDACXX_AVAILABILITY_TO_CHARS_FLOATING_POINT +// This controls the availability of floating-point std::to_chars functions. +// These overloads were added later than the integer overloads. +# define _LIBCUDACXX_AVAILABILITY_TO_CHARS_FLOATING_POINT - // This controls the availability of the C++20 synchronization library, - // which requires shared library support for various operations - // (see libcxx/src/atomic.cpp). This includes <barrier>, <latch>, - // <semaphore>, and notification functions on std::atomic. -# define _LIBCUDACXX_AVAILABILITY_SYNC +// This controls the availability of the C++20 synchronization library, +// which requires shared library support for various operations +// (see libcxx/src/atomic.cpp). This includes <barrier>, <latch>, +// <semaphore>, and notification functions on std::atomic.
+# define _LIBCUDACXX_AVAILABILITY_SYNC // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore - // This controls the availability of the C++20 format library. - // The library is in development and not ABI stable yet. P2216 is - // retroactively accepted in C++20. This paper contains ABI breaking - // changes. -# define _LIBCUDACXX_AVAILABILITY_FORMAT +// This controls the availability of the C++20 format library. +// The library is in development and not ABI stable yet. P2216 is +// retroactively accepted in C++20. This paper contains ABI breaking +// changes. +# define _LIBCUDACXX_AVAILABILITY_FORMAT // # define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format - // This controls whether the default verbose termination function is - // provided by the library. - // - // Note that when users provide their own custom function, it doesn't - // matter whether the dylib provides a default function, and the - // availability markup can actually give a false positive diagnostic - // (it will think that no function is provided, when in reality the - // user has provided their own). - // - // Users can pass -D_LIBCUDACXX_AVAILABILITY_CUSTOM_VERBOSE_ABORT_PROVIDED - // to the compiler to tell the library not to define its own verbose abort. - // Note that defining this macro but failing to define a custom function - // will lead to a load-time error on back-deployment targets, so it should - // be avoided. +// This controls whether the default verbose termination function is +// provided by the library. +// +// Note that when users provide their own custom function, it doesn't +// matter whether the dylib provides a default function, and the +// availability markup can actually give a false positive diagnostic +// (it will think that no function is provided, when in reality the +// user has provided their own). +// +// Users can pass -D_LIBCUDACXX_AVAILABILITY_CUSTOM_VERBOSE_ABORT_PROVIDED +// to the compiler to tell the library not to define its own verbose abort. +// Note that defining this macro but failing to define a custom function +// will lead to a load-time error on back-deployment targets, so it should +// be avoided. 
// # define _LIBCUDACXX_HAS_NO_VERBOSE_ABORT_IN_LIBRARY #elif defined(__APPLE__) -# define _LIBCUDACXX_AVAILABILITY_SHARED_MUTEX \ - __attribute__((availability(macos,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101200) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 100000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 100000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 30000) -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex -# endif - - // Note: bad_optional_access & friends were not introduced in the matching - // macOS and iOS versions, so the version mismatch between macOS and others - // is intended. -# define _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS \ - __attribute__((availability(macos,strict,introduced=10.13))) \ - __attribute__((availability(ios,strict,introduced=12.0))) \ - __attribute__((availability(tvos,strict,introduced=12.0))) \ - __attribute__((availability(watchos,strict,introduced=5.0))) -# define _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS \ - _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST \ - _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS - -# define _LIBCUDACXX_AVAILABILITY_UNCAUGHT_EXCEPTIONS \ - __attribute__((availability(macos,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) - -# define _LIBCUDACXX_AVAILABILITY_SIZED_NEW_DELETE \ - __attribute__((availability(macos,strict,introduced=10.12))) \ - __attribute__((availability(ios,strict,introduced=10.0))) \ - __attribute__((availability(tvos,strict,introduced=10.0))) \ - __attribute__((availability(watchos,strict,introduced=3.0))) - -# define _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR \ - __attribute__((availability(ios,strict,introduced=6.0))) - -# define _LIBCUDACXX_AVAILABILITY_TYPEINFO_VTABLE \ - __attribute__((availability(macos,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) - -# define _LIBCUDACXX_AVAILABILITY_LOCALE_CATEGORY \ - __attribute__((availability(macos,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) - -# define _LIBCUDACXX_AVAILABILITY_ATOMIC_SHARED_PTR \ - __attribute__((availability(macos,strict,introduced=10.9))) \ - __attribute__((availability(ios,strict,introduced=7.0))) - -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM \ - __attribute__((availability(macos,strict,introduced=10.15))) \ - __attribute__((availability(ios,strict,introduced=13.0))) \ - __attribute__((availability(tvos,strict,introduced=13.0))) \ - __attribute__((availability(watchos,strict,introduced=6.0))) -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_PUSH \ - _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), 
apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), apply_to=any(function,record))") \ - _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), apply_to=any(function,record))") -# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_POP \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") \ - _Pragma("clang attribute pop") -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem -# endif - -# define _LIBCUDACXX_AVAILABILITY_TO_CHARS_FLOATING_POINT \ - __attribute__((unavailable)) - -# define _LIBCUDACXX_AVAILABILITY_SYNC \ - __attribute__((availability(macos,strict,introduced=11.0))) \ - __attribute__((availability(ios,strict,introduced=14.0))) \ - __attribute__((availability(tvos,strict,introduced=14.0))) \ - __attribute__((availability(watchos,strict,introduced=7.0))) -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 140000) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 140000) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000) -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore -# endif - -# define _LIBCUDACXX_AVAILABILITY_FORMAT \ - __attribute__((unavailable)) -# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format - -# define _LIBCUDACXX_HAS_NO_VERBOSE_ABORT_IN_LIBRARY +# define _LIBCUDACXX_AVAILABILITY_SHARED_MUTEX \ + __attribute__((availability(macos, strict, introduced = 10.12))) \ + __attribute__((availability(ios, strict, introduced = 10.0))) \ + __attribute__((availability(tvos, strict, introduced = 10.0))) \ + __attribute__((availability(watchos, strict, introduced = 3.0))) +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101200) \ + || (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 100000) \ + || (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 100000) \ + || (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 30000) +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_mutex +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_shared_timed_mutex +# endif + +// Note: bad_optional_access & friends were not introduced in the matching +// macOS and iOS versions, so the version mismatch between macOS and others +// is intended. 
+# define _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS \ + __attribute__((availability(macos, strict, introduced = 10.13))) \ + __attribute__((availability(ios, strict, introduced = 12.0))) \ + __attribute__((availability(tvos, strict, introduced = 12.0))) \ + __attribute__((availability(watchos, strict, introduced = 5.0))) +# define _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS + +# define _LIBCUDACXX_AVAILABILITY_UNCAUGHT_EXCEPTIONS \ + __attribute__((availability(macos, strict, introduced = 10.12))) \ + __attribute__((availability(ios, strict, introduced = 10.0))) \ + __attribute__((availability(tvos, strict, introduced = 10.0))) \ + __attribute__((availability(watchos, strict, introduced = 3.0))) + +# define _LIBCUDACXX_AVAILABILITY_SIZED_NEW_DELETE \ + __attribute__((availability(macos, strict, introduced = 10.12))) \ + __attribute__((availability(ios, strict, introduced = 10.0))) \ + __attribute__((availability(tvos, strict, introduced = 10.0))) \ + __attribute__((availability(watchos, strict, introduced = 3.0))) + +# define _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR __attribute__((availability(ios, strict, introduced = 6.0))) + +# define _LIBCUDACXX_AVAILABILITY_TYPEINFO_VTABLE \ + __attribute__((availability(macos, strict, introduced = 10.9))) \ + __attribute__((availability(ios, strict, introduced = 7.0))) + +# define _LIBCUDACXX_AVAILABILITY_LOCALE_CATEGORY \ + __attribute__((availability(macos, strict, introduced = 10.9))) \ + __attribute__((availability(ios, strict, introduced = 7.0))) + +# define _LIBCUDACXX_AVAILABILITY_ATOMIC_SHARED_PTR \ + __attribute__((availability(macos, strict, introduced = 10.9))) \ + __attribute__((availability(ios, strict, introduced = 7.0))) + +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM \ + __attribute__((availability(macos, strict, introduced = 10.15))) \ + __attribute__((availability(ios, strict, introduced = 13.0))) \ + __attribute__((availability(tvos, strict, introduced = 13.0))) \ + __attribute__((availability(watchos, strict, introduced = 6.0))) +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_PUSH \ + _Pragma("clang attribute push(__attribute__((availability(macos,strict,introduced=10.15))), " \ + "apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(ios,strict,introduced=13.0))), " \ + "apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(tvos,strict,introduced=13.0))), " \ + "apply_to=any(function,record))") \ + _Pragma("clang attribute push(__attribute__((availability(watchos,strict,introduced=6.0))), " \ + "apply_to=any(function,record))") +# define _LIBCUDACXX_AVAILABILITY_FILESYSTEM_POP \ + _Pragma("clang attribute pop") _Pragma("clang attribute pop") _Pragma("clang attribute pop") \ + _Pragma("clang attribute pop") +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) \ + || (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) \ + || (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000) \ + || (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_filesystem +# endif + +# define 
_LIBCUDACXX_AVAILABILITY_TO_CHARS_FLOATING_POINT __attribute__((unavailable)) + +# define _LIBCUDACXX_AVAILABILITY_SYNC \ + __attribute__((availability(macos, strict, introduced = 11.0))) \ + __attribute__((availability(ios, strict, introduced = 14.0))) \ + __attribute__((availability(tvos, strict, introduced = 14.0))) \ + __attribute__((availability(watchos, strict, introduced = 7.0))) +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 110000) \ + || (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 140000) \ + || (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 140000) \ + || (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 70000) +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore +# endif + +# define _LIBCUDACXX_AVAILABILITY_FORMAT __attribute__((unavailable)) +# define _LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format + +# define _LIBCUDACXX_HAS_NO_VERBOSE_ABORT_IN_LIBRARY #else // ...New vendors can add availability markup here... -# error "It looks like you're trying to enable vendor availability markup, but you haven't defined the corresponding macros yet!" +# error \ + "It looks like you're trying to enable vendor availability markup, but you haven't defined the corresponding macros yet!" #endif @@ -290,15 +294,15 @@ // Those are defined in terms of the availability attributes above, and // should not be vendor-specific. 
#if defined(_LIBCUDACXX_NO_EXCEPTIONS) -# define _LIBCUDACXX_AVAILABILITY_FUTURE -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_ANY_CAST -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS +# define _LIBCUDACXX_AVAILABILITY_FUTURE +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_ANY_CAST +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS #else -# define _LIBCUDACXX_AVAILABILITY_FUTURE _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_ANY_CAST _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS -# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS +# define _LIBCUDACXX_AVAILABILITY_FUTURE _LIBCUDACXX_AVAILABILITY_FUTURE_ERROR +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_ANY_CAST _LIBCUDACXX_AVAILABILITY_BAD_ANY_CAST +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_OPTIONAL_ACCESS +# define _LIBCUDACXX_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCUDACXX_AVAILABILITY_BAD_VARIANT_ACCESS #endif #endif // _LIBCUDACXX___AVAILABILITY diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference b/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference index 4ce42eb4c6a..88325c3d5c9 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__bit_reference @@ -10,9 +10,9 @@ #ifndef _LIBCUDACXX___BIT_REFERENCE #define _LIBCUDACXX___BIT_REFERENCE -##include -#include +##include #include +#include #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -22,229 +22,259 @@ # pragma system_header #endif // no system header -_LIBCUDACXX_PUSH_MACROS + _LIBCUDACXX_PUSH_MACROS #include <__undef_macros> + _LIBCUDACXX_BEGIN_NAMESPACE_STD -_LIBCUDACXX_BEGIN_NAMESPACE_STD - -template class __bit_iterator; -template class __bit_const_reference; +template +class __bit_iterator; +template +class __bit_const_reference; template struct __has_storage_type { - static const bool value = false; + static const bool value = false; }; template ::value> class __bit_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - - __storage_pointer __seg_; - __storage_type __mask_; + typedef typename _Cp::__storage_type __storage_type; + typedef typename _Cp::__storage_pointer __storage_pointer; - friend typename _Cp::__self; + __storage_pointer __seg_; + __storage_type __mask_; - friend class __bit_const_reference<_Cp>; - friend class __bit_iterator<_Cp, false>; -public: - _LIBCUDACXX_INLINE_VISIBILITY - __bit_reference(const __bit_reference&) = default; + friend typename _Cp::__self; - _LIBCUDACXX_INLINE_VISIBILITY operator bool() const noexcept - {return static_cast(*__seg_ & __mask_);} - _LIBCUDACXX_INLINE_VISIBILITY bool operator ~() const noexcept - {return !static_cast(*this);} + friend class __bit_const_reference<_Cp>; + friend class __bit_iterator<_Cp, false>; - _LIBCUDACXX_INLINE_VISIBILITY - __bit_reference& operator=(bool __x) noexcept +public: + _LIBCUDACXX_INLINE_VISIBILITY __bit_reference(const __bit_reference&) = default; + + _LIBCUDACXX_INLINE_VISIBILITY operator bool() const noexcept + { + return static_cast(*__seg_ & __mask_); + } + _LIBCUDACXX_INLINE_VISIBILITY bool operator~() 
const noexcept + { + return !static_cast(*this); + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_reference& operator=(bool __x) noexcept + { + if (__x) { - if (__x) - *__seg_ |= __mask_; - else - *__seg_ &= ~__mask_; - return *this; + *__seg_ |= __mask_; } + else + { + *__seg_ &= ~__mask_; + } + return *this; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_reference& operator=(const __bit_reference& __x) noexcept + { + return operator=(static_cast(__x)); + } + + _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept + { + *__seg_ ^= __mask_; + } + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> operator&() const noexcept + { + return __bit_iterator<_Cp, false>(__seg_, static_cast(__libcpp_ctz(__mask_))); + } - _LIBCUDACXX_INLINE_VISIBILITY - __bit_reference& operator=(const __bit_reference& __x) noexcept - {return operator=(static_cast(__x));} - - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept {*__seg_ ^= __mask_;} - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> operator&() const noexcept - {return __bit_iterator<_Cp, false>(__seg_, static_cast(__libcpp_ctz(__mask_)));} private: - _LIBCUDACXX_INLINE_VISIBILITY - __bit_reference(__storage_pointer __s, __storage_type __m) noexcept - : __seg_(__s), __mask_(__m) {} + _LIBCUDACXX_INLINE_VISIBILITY __bit_reference(__storage_pointer __s, __storage_type __m) noexcept + : __seg_(__s) + , __mask_(__m) + {} }; template class __bit_reference<_Cp, false> -{ -}; +{}; template -inline _LIBCUDACXX_INLINE_VISIBILITY -void -swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, __bit_reference<_Cp> __y) noexcept { - bool __t = __x; - __x = __y; - __y = __t; + bool __t = __x; + __x = __y; + __y = __t; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -void -swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, __bit_reference<_Dp> __y) noexcept { - bool __t = __x; - __x = __y; - __y = __t; + bool __t = __x; + __x = __y; + __y = __t; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -void -swap(__bit_reference<_Cp> __x, bool& __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void swap(__bit_reference<_Cp> __x, bool& __y) noexcept { - bool __t = __x; - __x = __y; - __y = __t; + bool __t = __x; + __x = __y; + __y = __t; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -void -swap(bool& __x, __bit_reference<_Cp> __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void swap(bool& __x, __bit_reference<_Cp> __y) noexcept { - bool __t = __x; - __x = __y; - __y = __t; + bool __t = __x; + __x = __y; + __y = __t; } template class __bit_const_reference { - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__const_storage_pointer __storage_pointer; + typedef typename _Cp::__storage_type __storage_type; + typedef typename _Cp::__const_storage_pointer __storage_pointer; + + __storage_pointer __seg_; + __storage_type __mask_; - __storage_pointer __seg_; - __storage_type __mask_; + friend typename _Cp::__self; + friend class __bit_iterator<_Cp, true>; - friend typename _Cp::__self; - friend class __bit_iterator<_Cp, true>; public: - _LIBCUDACXX_INLINE_VISIBILITY - __bit_const_reference(const __bit_const_reference&) = default; + _LIBCUDACXX_INLINE_VISIBILITY __bit_const_reference(const __bit_const_reference&) = default; + + _LIBCUDACXX_INLINE_VISIBILITY __bit_const_reference(const __bit_reference<_Cp>& __x) noexcept + : __seg_(__x.__seg_) + , __mask_(__x.__mask_) + {} - 
_LIBCUDACXX_INLINE_VISIBILITY - __bit_const_reference(const __bit_reference<_Cp>& __x) noexcept - : __seg_(__x.__seg_), __mask_(__x.__mask_) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr operator bool() const noexcept + { + return static_cast(*__seg_ & __mask_); + } - _LIBCUDACXX_INLINE_VISIBILITY constexpr operator bool() const noexcept - {return static_cast(*__seg_ & __mask_);} + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, true> operator&() const noexcept + { + return __bit_iterator<_Cp, true>(__seg_, static_cast(__libcpp_ctz(__mask_))); + } - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, true> operator&() const noexcept - {return __bit_iterator<_Cp, true>(__seg_, static_cast(__libcpp_ctz(__mask_)));} private: - _LIBCUDACXX_INLINE_VISIBILITY - constexpr - __bit_const_reference(__storage_pointer __s, __storage_type __m) noexcept - : __seg_(__s), __mask_(__m) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __bit_const_reference(__storage_pointer __s, __storage_type __m) noexcept + : __seg_(__s) + , __mask_(__m) + {} - __bit_const_reference& operator=(const __bit_const_reference&) = delete; + __bit_const_reference& operator=(const __bit_const_reference&) = delete; }; // find template -__bit_iterator<_Cp, _IsConst> -__find_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) +__bit_iterator<_Cp, _IsConst> __find_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) + typedef __bit_iterator<_Cp, _IsConst> _It; + typedef typename _It::__storage_type __storage_type; + static const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = *__first.__seg_ & __m; + if (__b) { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first.__seg_ & __m; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - if (__n == __dn) - return __first + __n; - __n -= __dn; - ++__first.__seg_; + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - if (*__first.__seg_) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(*__first.__seg_))); - // do last partial word - if (__n > 0) + if (__n == __dn) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + return __first + __n; } - return _It(__first.__seg_, static_cast(__n)); + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + { + if (*__first.__seg_) + { + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(*__first.__seg_))); + } + } + // do last partial 
word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first.__seg_ & __m; + if (__b) + { + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + } + } + return _It(__first.__seg_, static_cast(__n)); } template -__bit_iterator<_Cp, _IsConst> -__find_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) +__bit_iterator<_Cp, _IsConst> __find_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) + typedef __bit_iterator<_Cp, _IsConst> _It; + typedef typename _It::__storage_type __storage_type; + const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = ~*__first.__seg_ & __m; + if (__b) { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = ~*__first.__seg_ & __m; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); - if (__n == __dn) - return __first + __n; - __n -= __dn; - ++__first.__seg_; + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + if (__n == __dn) { - __storage_type __b = ~*__first.__seg_; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + return __first + __n; } - // do last partial word - if (__n > 0) + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + { + __storage_type __b = ~*__first.__seg_; + if (__b) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = ~*__first.__seg_ & __m; - if (__b) - return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); } - return _It(__first.__seg_, static_cast(__n)); + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = ~*__first.__seg_ & __m; + if (__b) + { + return _It(__first.__seg_, static_cast(_CUDA_VSTD::__libcpp_ctz(__b))); + } + } + return _It(__first.__seg_, static_cast(__n)); } template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, _IsConst> +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, _IsConst> find(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) { - if (static_cast(__value_)) - return __find_bool_true(__first, static_cast(__last - __first)); - return __find_bool_false(__first, static_cast(__last - __first)); + if (static_cast(__value_)) + { + return __find_bool_true(__first, static_cast(__last - __first)); + } + return __find_bool_false(__first, static_cast(__last - __first)); } // count @@ 
-253,627 +283,633 @@ template typename __bit_iterator<_Cp, _IsConst>::difference_type __count_bool_true(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - typedef typename _It::difference_type difference_type; - const int __bits_per_word = _It::__bits_per_word; - difference_type __r = 0; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_); - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); - } - return __r; + typedef __bit_iterator<_Cp, _IsConst> _It; + typedef typename _It::__storage_type __storage_type; + typedef typename _It::difference_type difference_type; + const int __bits_per_word = _It::__bits_per_word; + difference_type __r = 0; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __r = _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + { + __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_); + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __r += _CUDA_VSTD::__libcpp_popcount(*__first.__seg_ & __m); + } + return __r; } template typename __bit_iterator<_Cp, _IsConst>::difference_type __count_bool_false(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, _IsConst> _It; - typedef typename _It::__storage_type __storage_type; - typedef typename _It::difference_type difference_type; - const int __bits_per_word = _It::__bits_per_word; - difference_type __r = 0; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __r = _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) - __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_); - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); - } - return __r; + typedef __bit_iterator<_Cp, _IsConst> _It; + typedef typename _It::__storage_type __storage_type; + typedef typename _It::difference_type difference_type; + const int __bits_per_word = _It::__bits_per_word; + 
difference_type __r = 0; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __r = _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) + { + __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_); + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __r += _CUDA_VSTD::__libcpp_popcount(~*__first.__seg_ & __m); + } + return __r; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -typename __bit_iterator<_Cp, _IsConst>::difference_type +inline _LIBCUDACXX_INLINE_VISIBILITY typename __bit_iterator<_Cp, _IsConst>::difference_type count(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, const _Tp& __value_) { - if (static_cast(__value_)) - return __count_bool_true(__first, static_cast(__last - __first)); - return __count_bool_false(__first, static_cast(__last - __first)); + if (static_cast(__value_)) + { + return __count_bool_true(__first, static_cast(__last - __first)); + } + return __count_bool_false(__first, static_cast(__last - __first)); } // fill_n template -void -__fill_n_false(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) +void __fill_n_false(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, false> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - *__first.__seg_ &= ~__m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), 0, __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__first.__seg_ &= ~__m; - } + typedef __bit_iterator<_Cp, false> _It; + typedef typename _It::__storage_type __storage_type; + const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + *__first.__seg_ &= ~__m; + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + __storage_type __nw = __n / __bits_per_word; + _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), 0, __nw * sizeof(__storage_type)); + __n -= __nw * __bits_per_word; + // do last partial word + if (__n > 0) + { + __first.__seg_ += __nw; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + *__first.__seg_ &= ~__m; + } } template -void -__fill_n_true(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) 
+void __fill_n_true(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n) { - typedef __bit_iterator<_Cp, false> _It; - typedef typename _It::__storage_type __storage_type; - const int __bits_per_word = _It::__bits_per_word; - // do first partial word - if (__first.__ctz_ != 0) - { - __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); - __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - *__first.__seg_ |= __m; - __n -= __dn; - ++__first.__seg_; - } - // do middle whole words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), -1, __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last partial word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - *__first.__seg_ |= __m; - } + typedef __bit_iterator<_Cp, false> _It; + typedef typename _It::__storage_type __storage_type; + const int __bits_per_word = _It::__bits_per_word; + // do first partial word + if (__first.__ctz_ != 0) + { + __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_); + __storage_type __dn = _CUDA_VSTD::min(__clz_f, __n); + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + *__first.__seg_ |= __m; + __n -= __dn; + ++__first.__seg_; + } + // do middle whole words + __storage_type __nw = __n / __bits_per_word; + _CUDA_VSTD::memset(_CUDA_VSTD::__to_raw_pointer(__first.__seg_), -1, __nw * sizeof(__storage_type)); + __n -= __nw * __bits_per_word; + // do last partial word + if (__n > 0) + { + __first.__seg_ += __nw; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + *__first.__seg_ |= __m; + } } template -inline _LIBCUDACXX_INLINE_VISIBILITY -void +inline _LIBCUDACXX_INLINE_VISIBILITY void fill_n(__bit_iterator<_Cp, false> __first, typename _Cp::size_type __n, bool __value_) { - if (__n > 0) + if (__n > 0) + { + if (__value_) { - if (__value_) - __fill_n_true(__first, __n); - else - __fill_n_false(__first, __n); + __fill_n_true(__first, __n); } + else + { + __fill_n_false(__first, __n); + } + } } // fill template -inline _LIBCUDACXX_INLINE_VISIBILITY -void +inline _LIBCUDACXX_INLINE_VISIBILITY void fill(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __last, bool __value_) { - _CUDA_VSTD::fill_n(__first, static_cast(__last - __first), __value_); + _CUDA_VSTD::fill_n(__first, static_cast(__last - __first), __value_); } // copy template -__bit_iterator<_Cp, false> -__copy_aligned(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, - __bit_iterator<_Cp, false> __result) +__bit_iterator<_Cp, false> __copy_aligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<_Cp, _IsConst> _In; + typedef typename _In::difference_type difference_type; + typedef typename _In::__storage_type __storage_type; + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__first.__ctz_ != 0) 
+ { + unsigned __clz = __bits_per_word - __first.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __b = *__first.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + __storage_type __nw = __n / __bits_per_word; + _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), + _CUDA_VSTD::__to_raw_pointer(__first.__seg_), + __nw * sizeof(__storage_type)); + __n -= __nw * __bits_per_word; + __result.__seg_ += __nw; + // do last word if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) - { - unsigned __clz = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); - __storage_type __b = *__first.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - __storage_type __nw = __n / __bits_per_word; - _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), - _CUDA_VSTD::__to_raw_pointer(__first.__seg_), - __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - __result.__seg_ += __nw; - // do last word - if (__n > 0) - { - __first.__seg_ += __nw; - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast(__n); - } + __first.__seg_ += __nw; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast(__n); } - return __result; + } + return __result; } template -__bit_iterator<_Cp, false> -__copy_unaligned(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, - __bit_iterator<_Cp, false> __result) +__bit_iterator<_Cp, false> __copy_unaligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - static const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<_Cp, _IsConst> _In; + typedef typename _In::difference_type difference_type; + typedef typename _In::__storage_type __storage_type; + static const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__first.__ctz_ != 0) + { + unsigned __clz_f = __bits_per_word - __first.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = *__first.__seg_ & __m; + unsigned __clz_r = 
__bits_per_word - __result.__ctz_; + __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __first.__ctz_) + { + *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_); + } + else + { + *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_); + } + __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; + if (__dn > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __dn); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn); + __result.__ctz_ = static_cast(__dn); + } + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __m = ~__storage_type(0) << __result.__ctz_; + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) + { + __storage_type __b = *__first.__seg_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << __result.__ctz_; + ++__result.__seg_; + *__result.__seg_ &= __m; + *__result.__seg_ |= __b >> __clz_r; + } + // do last word if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) - { - unsigned __clz_f = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first.__seg_ & __m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __first.__ctz_) - *__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_); - else - *__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_); - __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - if (__dn > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __dn); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> (__first.__ctz_ + __ddn); - __result.__ctz_ = static_cast(__dn); - } - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __m = ~__storage_type(0) << __result.__ctz_; - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) - { - __storage_type __b = *__first.__seg_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << __result.__ctz_; - ++__result.__seg_; - *__result.__seg_ &= __m; - *__result.__seg_ |= __b >> __clz_r; - } - // do last word - if (__n > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first.__seg_ & __m; - __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__clz_r)); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << __result.__ctz_; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - 
*__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> __dn; - __result.__ctz_ = static_cast(__n); - } - } + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first.__seg_ & __m; + __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__clz_r)); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << __result.__ctz_; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> __dn; + __result.__ctz_ = static_cast(__n); + } } - return __result; + } + return __result; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, false> +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - if (__first.__ctz_ == __result.__ctz_) - return __copy_aligned(__first, __last, __result); - return __copy_unaligned(__first, __last, __result); + if (__first.__ctz_ == __result.__ctz_) + { + return __copy_aligned(__first, __last, __result); + } + return __copy_unaligned(__first, __last, __result); } // copy_backward template -__bit_iterator<_Cp, false> -__copy_backward_aligned(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, - __bit_iterator<_Cp, false> __result) +__bit_iterator<_Cp, false> __copy_backward_aligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<_Cp, _IsConst> _In; + typedef typename _In::difference_type difference_type; + typedef typename _In::__storage_type __storage_type; + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__last.__ctz_ != 0) + { + difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); + __n -= __dn; + unsigned __clz = __bits_per_word - __last.__ctz_; + __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz); + __storage_type __b = *__last.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + // __last.__ctz_ = 0 + } + // __last.__ctz_ == 0 || __n == 0 + // __result.__ctz_ == 0 || __n == 0 + // do middle words + __storage_type __nw = __n / __bits_per_word; + __result.__seg_ -= __nw; + __last.__seg_ -= __nw; + _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), + _CUDA_VSTD::__to_raw_pointer(__last.__seg_), + __nw * sizeof(__storage_type)); + __n -= __nw * __bits_per_word; + // do last word if (__n > 0) { - // do first word - if (__last.__ctz_ != 0) - { - difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); - __n -= __dn; - unsigned __clz = __bits_per_word - __last.__ctz_; - __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz); - __storage_type __b = *__last.__seg_ & __m; - 
*__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + - __result.__ctz_) % __bits_per_word); - // __last.__ctz_ = 0 - } - // __last.__ctz_ == 0 || __n == 0 - // __result.__ctz_ == 0 || __n == 0 - // do middle words - __storage_type __nw = __n / __bits_per_word; - __result.__seg_ -= __nw; - __last.__seg_ -= __nw; - _CUDA_VSTD::memmove(_CUDA_VSTD::__to_raw_pointer(__result.__seg_), - _CUDA_VSTD::__to_raw_pointer(__last.__seg_), - __nw * sizeof(__storage_type)); - __n -= __nw * __bits_per_word; - // do last word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); - __storage_type __b = *--__last.__seg_ & __m; - *--__result.__seg_ &= ~__m; - *__result.__seg_ |= __b; - __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); - } + __storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); + __storage_type __b = *--__last.__seg_ & __m; + *--__result.__seg_ &= ~__m; + *__result.__seg_ |= __b; + __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); } - return __result; + } + return __result; } template -__bit_iterator<_Cp, false> -__copy_backward_unaligned(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, - __bit_iterator<_Cp, false> __result) +__bit_iterator<_Cp, false> __copy_backward_unaligned( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - typedef __bit_iterator<_Cp, _IsConst> _In; - typedef typename _In::difference_type difference_type; - typedef typename _In::__storage_type __storage_type; - const int __bits_per_word = _In::__bits_per_word; - difference_type __n = __last - __first; - if (__n > 0) + typedef __bit_iterator<_Cp, _IsConst> _In; + typedef typename _In::difference_type difference_type; + typedef typename _In::__storage_type __storage_type; + const int __bits_per_word = _In::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__last.__ctz_ != 0) { - // do first word - if (__last.__ctz_ != 0) - { - difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); - __n -= __dn; - unsigned __clz_l = __bits_per_word - __last.__ctz_; - __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l); - __storage_type __b = *__last.__seg_ & __m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = _CUDA_VSTD::min(__dn, static_cast(__result.__ctz_)); - if (__ddn > 0) - { - __m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r); - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __last.__ctz_) - *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); - else - *__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_); - __result.__ctz_ = static_cast(((-__ddn & (__bits_per_word - 1)) + - __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - } - if (__dn > 0) - { - // __result.__ctz_ == 0 - --__result.__seg_; - __result.__ctz_ = static_cast(-__dn & (__bits_per_word - 1)); - __m = ~__storage_type(0) << __result.__ctz_; - *__result.__seg_ &= ~__m; - __last.__ctz_ -= __dn + __ddn; - *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); - } - // __last.__ctz_ = 0 - } - // __last.__ctz_ == 0 || __n == 0 - // __result.__ctz_ != 0 || __n == 0 - // do middle words - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __m = ~__storage_type(0) >> __clz_r; - for (; __n >= __bits_per_word; 
__n -= __bits_per_word) + difference_type __dn = _CUDA_VSTD::min(static_cast(__last.__ctz_), __n); + __n -= __dn; + unsigned __clz_l = __bits_per_word - __last.__ctz_; + __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l); + __storage_type __b = *__last.__seg_ & __m; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __ddn = _CUDA_VSTD::min(__dn, static_cast(__result.__ctz_)); + if (__ddn > 0) + { + __m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r); + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __last.__ctz_) { - __storage_type __b = *--__last.__seg_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> __clz_r; - *--__result.__seg_ &= __m; - *__result.__seg_ |= __b << __result.__ctz_; + *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); } - // do last word - if (__n > 0) + else { - __m = ~__storage_type(0) << (__bits_per_word - __n); - __storage_type __b = *--__last.__seg_ & __m; - __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__result.__ctz_)); - __m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r); - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); - __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + - __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) - { - // __result.__ctz_ == 0 - --__result.__seg_; - __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); - __m = ~__storage_type(0) << __result.__ctz_; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn)); - } + *__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_); } + __result.__ctz_ = static_cast(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; + } + if (__dn > 0) + { + // __result.__ctz_ == 0 + --__result.__seg_; + __result.__ctz_ = static_cast(-__dn & (__bits_per_word - 1)); + __m = ~__storage_type(0) << __result.__ctz_; + *__result.__seg_ &= ~__m; + __last.__ctz_ -= __dn + __ddn; + *__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_); + } + // __last.__ctz_ = 0 + } + // __last.__ctz_ == 0 || __n == 0 + // __result.__ctz_ != 0 || __n == 0 + // do middle words + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __m = ~__storage_type(0) >> __clz_r; + for (; __n >= __bits_per_word; __n -= __bits_per_word) + { + __storage_type __b = *--__last.__seg_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> __clz_r; + *--__result.__seg_ &= __m; + *__result.__seg_ |= __b << __result.__ctz_; } - return __result; + // do last word + if (__n > 0) + { + __m = ~__storage_type(0) << (__bits_per_word - __n); + __storage_type __b = *--__last.__seg_ & __m; + __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__result.__ctz_)); + __m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r); + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); + __result.__ctz_ = static_cast(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) + { + // __result.__ctz_ == 0 + --__result.__seg_; + __result.__ctz_ = static_cast(-__n & (__bits_per_word - 1)); + __m = ~__storage_type(0) << __result.__ctz_; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b << 
(__result.__ctz_ - (__bits_per_word - __n - __dn)); + } + } + } + return __result; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, false> -copy_backward(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> copy_backward( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - if (__last.__ctz_ == __result.__ctz_) - return __copy_backward_aligned(__first, __last, __result); - return __copy_backward_unaligned(__first, __last, __result); + if (__last.__ctz_ == __result.__ctz_) + { + return __copy_backward_aligned(__first, __last, __result); + } + return __copy_backward_unaligned(__first, __last, __result); } // move template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, false> +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return _CUDA_VSTD::copy(__first, __last, __result); + return _CUDA_VSTD::copy(__first, __last, __result); } // move_backward template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<_Cp, false> -move_backward(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<_Cp, false> move_backward( + __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return _CUDA_VSTD::copy_backward(__first, __last, __result); + return _CUDA_VSTD::copy_backward(__first, __last, __result); } // swap_ranges template -__bit_iterator<__C2, false> -__swap_ranges_aligned(__bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, - __bit_iterator<__C2, false> __result) +__bit_iterator<__C2, false> __swap_ranges_aligned( + __bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, __bit_iterator<__C2, false> __result) { - typedef __bit_iterator<__C1, false> _I1; - typedef typename _I1::difference_type difference_type; - typedef typename _I1::__storage_type __storage_type; - const int __bits_per_word = _I1::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<__C1, false> _I1; + typedef typename _I1::difference_type difference_type; + typedef typename _I1::__storage_type __storage_type; + const int __bits_per_word = _I1::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__first.__ctz_ != 0) + { + unsigned __clz = __bits_per_word - __first.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1; + *__first.__seg_ |= __b2; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_) + { + swap(*__first.__seg_, *__result.__seg_); + } + // do last word if (__n > 0) { - // do first word - 
if (__first.__ctz_ != 0) - { - unsigned __clz = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1; - *__first.__seg_ |= __b2; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_, ++__result.__seg_) - swap(*__first.__seg_, *__result.__seg_); - // do last word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1; - *__first.__seg_ |= __b2; - __result.__ctz_ = static_cast(__n); - } + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1; + *__first.__seg_ |= __b2; + __result.__ctz_ = static_cast(__n); } - return __result; + } + return __result; } template -__bit_iterator<__C2, false> -__swap_ranges_unaligned(__bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, - __bit_iterator<__C2, false> __result) +__bit_iterator<__C2, false> __swap_ranges_unaligned( + __bit_iterator<__C1, false> __first, __bit_iterator<__C1, false> __last, __bit_iterator<__C2, false> __result) { - typedef __bit_iterator<__C1, false> _I1; - typedef typename _I1::difference_type difference_type; - typedef typename _I1::__storage_type __storage_type; - const int __bits_per_word = _I1::__bits_per_word; - difference_type __n = __last - __first; + typedef __bit_iterator<__C1, false> _I1; + typedef typename _I1::difference_type difference_type; + typedef typename _I1::__storage_type __storage_type; + const int __bits_per_word = _I1::__bits_per_word; + difference_type __n = __last - __first; + if (__n > 0) + { + // do first word + if (__first.__ctz_ != 0) + { + unsigned __clz_f = __bits_per_word - __first.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + if (__result.__ctz_ > __first.__ctz_) + { + unsigned __s = __result.__ctz_ - __first.__ctz_; + *__result.__seg_ |= __b1 << __s; + *__first.__seg_ |= __b2 >> __s; + } + else + { + unsigned __s = __first.__ctz_ - __result.__ctz_; + *__result.__seg_ |= __b1 >> __s; + *__first.__seg_ |= __b2 << __s; + } + __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); + __dn -= __ddn; 
+ if (__dn > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __dn); + __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + unsigned __s = __first.__ctz_ + __ddn; + *__result.__seg_ |= __b1 >> __s; + *__first.__seg_ |= __b2 << __s; + __result.__ctz_ = static_cast(__dn); + } + ++__first.__seg_; + // __first.__ctz_ = 0; + } + // __first.__ctz_ == 0; + // do middle words + __storage_type __m = ~__storage_type(0) << __result.__ctz_; + unsigned __clz_r = __bits_per_word - __result.__ctz_; + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) + { + __storage_type __b1 = *__first.__seg_; + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 << __result.__ctz_; + *__first.__seg_ = __b2 >> __result.__ctz_; + ++__result.__seg_; + __b2 = *__result.__seg_ & ~__m; + *__result.__seg_ &= __m; + *__result.__seg_ |= __b1 >> __clz_r; + *__first.__seg_ |= __b2 << __clz_r; + } + // do last word if (__n > 0) { - // do first word - if (__first.__ctz_ != 0) - { - unsigned __clz_f = __bits_per_word - __first.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - if (__result.__ctz_ > __first.__ctz_) - { - unsigned __s = __result.__ctz_ - __first.__ctz_; - *__result.__seg_ |= __b1 << __s; - *__first.__seg_ |= __b2 >> __s; - } - else - { - unsigned __s = __first.__ctz_ - __result.__ctz_; - *__result.__seg_ |= __b1 >> __s; - *__first.__seg_ |= __b2 << __s; - } - __result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__ddn + __result.__ctz_) % __bits_per_word); - __dn -= __ddn; - if (__dn > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __dn); - __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - unsigned __s = __first.__ctz_ + __ddn; - *__result.__seg_ |= __b1 >> __s; - *__first.__seg_ |= __b2 << __s; - __result.__ctz_ = static_cast(__dn); - } - ++__first.__seg_; - // __first.__ctz_ = 0; - } - // __first.__ctz_ == 0; - // do middle words - __storage_type __m = ~__storage_type(0) << __result.__ctz_; - unsigned __clz_r = __bits_per_word - __result.__ctz_; - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) - { - __storage_type __b1 = *__first.__seg_; - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 << __result.__ctz_; - *__first.__seg_ = __b2 >> __result.__ctz_; - ++__result.__seg_; - __b2 = *__result.__seg_ & ~__m; - *__result.__seg_ &= __m; - *__result.__seg_ |= __b1 >> __clz_r; - *__first.__seg_ |= __b2 << __clz_r; - } - // do last word - if (__n > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b1 = *__first.__seg_ & __m; - *__first.__seg_ &= ~__m; - __storage_type __dn = _CUDA_VSTD::min<__storage_type>(__n, __clz_r); - __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); - __storage_type __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 << __result.__ctz_; - *__first.__seg_ |= __b2 >> 
__result.__ctz_; - __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; - __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __b2 = *__result.__seg_ & __m; - *__result.__seg_ &= ~__m; - *__result.__seg_ |= __b1 >> __dn; - *__first.__seg_ |= __b2 << __dn; - __result.__ctz_ = static_cast(__n); - } - } + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b1 = *__first.__seg_ & __m; + *__first.__seg_ &= ~__m; + __storage_type __dn = _CUDA_VSTD::min<__storage_type>(__n, __clz_r); + __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); + __storage_type __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 << __result.__ctz_; + *__first.__seg_ |= __b2 >> __result.__ctz_; + __result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word; + __result.__ctz_ = static_cast((__dn + __result.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __b2 = *__result.__seg_ & __m; + *__result.__seg_ &= ~__m; + *__result.__seg_ |= __b1 >> __dn; + *__first.__seg_ |= __b2 << __dn; + __result.__ctz_ = static_cast(__n); + } } - return __result; + } + return __result; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -__bit_iterator<__C2, false> -swap_ranges(__bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __last1, - __bit_iterator<__C2, false> __first2) +inline _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator<__C2, false> swap_ranges( + __bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __last1, __bit_iterator<__C2, false> __first2) { - if (__first1.__ctz_ == __first2.__ctz_) - return __swap_ranges_aligned(__first1, __last1, __first2); - return __swap_ranges_unaligned(__first1, __last1, __first2); + if (__first1.__ctz_ == __first2.__ctz_) + { + return __swap_ranges_aligned(__first1, __last1, __first2); + } + return __swap_ranges_unaligned(__first1, __last1, __first2); } // rotate @@ -881,413 +917,469 @@ swap_ranges(__bit_iterator<__C1, false> __first1, __bit_iterator<__C1, false> __ template struct __bit_array { - typedef typename _Cp::difference_type difference_type; - typedef typename _Cp::__storage_type __storage_type; - typedef typename _Cp::__storage_pointer __storage_pointer; - typedef typename _Cp::iterator iterator; - static const unsigned __bits_per_word = _Cp::__bits_per_word; - static const unsigned _Np = 4; - - difference_type __size_; - __storage_type __word_[_Np]; - - _LIBCUDACXX_INLINE_VISIBILITY static difference_type capacity() - {return static_cast(_Np * __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY explicit __bit_array(difference_type __s) : __size_(__s) {} - _LIBCUDACXX_INLINE_VISIBILITY iterator begin() - { - return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]), 0); - } - _LIBCUDACXX_INLINE_VISIBILITY iterator end() - { - return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]) + __size_ / __bits_per_word, - static_cast(__size_ % __bits_per_word)); - } + typedef typename _Cp::difference_type difference_type; + typedef typename _Cp::__storage_type __storage_type; + typedef typename _Cp::__storage_pointer __storage_pointer; + typedef typename _Cp::iterator iterator; + static const unsigned __bits_per_word = _Cp::__bits_per_word; + static const unsigned _Np = 4; + + difference_type __size_; + __storage_type __word_[_Np]; + + _LIBCUDACXX_INLINE_VISIBILITY 
static difference_type capacity() + { + return static_cast(_Np * __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY explicit __bit_array(difference_type __s) + : __size_(__s) + {} + _LIBCUDACXX_INLINE_VISIBILITY iterator begin() + { + return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]), 0); + } + _LIBCUDACXX_INLINE_VISIBILITY iterator end() + { + return iterator(pointer_traits<__storage_pointer>::pointer_to(__word_[0]) + __size_ / __bits_per_word, + static_cast(__size_ % __bits_per_word)); + } }; template __bit_iterator<_Cp, false> rotate(__bit_iterator<_Cp, false> __first, __bit_iterator<_Cp, false> __middle, __bit_iterator<_Cp, false> __last) { - typedef __bit_iterator<_Cp, false> _I1; - typedef typename _I1::difference_type difference_type; - difference_type __d1 = __middle - __first; - difference_type __d2 = __last - __middle; - _I1 __r = __first + __d2; - while (__d1 != 0 && __d2 != 0) + typedef __bit_iterator<_Cp, false> _I1; + typedef typename _I1::difference_type difference_type; + difference_type __d1 = __middle - __first; + difference_type __d2 = __last - __middle; + _I1 __r = __first + __d2; + while (__d1 != 0 && __d2 != 0) + { + if (__d1 <= __d2) { - if (__d1 <= __d2) - { - if (__d1 <= __bit_array<_Cp>::capacity()) - { - __bit_array<_Cp> __b(__d1); - _CUDA_VSTD::copy(__first, __middle, __b.begin()); - _CUDA_VSTD::copy(__b.begin(), __b.end(), _CUDA_VSTD::copy(__middle, __last, __first)); - break; - } - else - { - __bit_iterator<_Cp, false> __mp = _CUDA_VSTD::swap_ranges(__first, __middle, __middle); - __first = __middle; - __middle = __mp; - __d2 -= __d1; - } - } - else - { - if (__d2 <= __bit_array<_Cp>::capacity()) - { - __bit_array<_Cp> __b(__d2); - _CUDA_VSTD::copy(__middle, __last, __b.begin()); - _CUDA_VSTD::copy_backward(__b.begin(), __b.end(), _CUDA_VSTD::copy_backward(__first, __middle, __last)); - break; - } - else - { - __bit_iterator<_Cp, false> __mp = __first + __d2; - _CUDA_VSTD::swap_ranges(__first, __mp, __middle); - __first = __mp; - __d1 -= __d2; - } - } + if (__d1 <= __bit_array<_Cp>::capacity()) + { + __bit_array<_Cp> __b(__d1); + _CUDA_VSTD::copy(__first, __middle, __b.begin()); + _CUDA_VSTD::copy(__b.begin(), __b.end(), _CUDA_VSTD::copy(__middle, __last, __first)); + break; + } + else + { + __bit_iterator<_Cp, false> __mp = _CUDA_VSTD::swap_ranges(__first, __middle, __middle); + __first = __middle; + __middle = __mp; + __d2 -= __d1; + } + } + else + { + if (__d2 <= __bit_array<_Cp>::capacity()) + { + __bit_array<_Cp> __b(__d2); + _CUDA_VSTD::copy(__middle, __last, __b.begin()); + _CUDA_VSTD::copy_backward(__b.begin(), __b.end(), _CUDA_VSTD::copy_backward(__first, __middle, __last)); + break; + } + else + { + __bit_iterator<_Cp, false> __mp = __first + __d2; + _CUDA_VSTD::swap_ranges(__first, __mp, __middle); + __first = __mp; + __d1 -= __d2; + } } - return __r; + } + return __r; } // equal template -bool -__equal_unaligned(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, - __bit_iterator<_Cp, _IC2> __first2) +bool __equal_unaligned( + __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - typedef __bit_iterator<_Cp, _IC1> _It; - typedef typename _It::difference_type difference_type; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - difference_type __n = __last1 - __first1; - if (__n > 0) + typedef __bit_iterator<_Cp, _IC1> _It; + typedef typename _It::difference_type difference_type; + 
typedef typename _It::__storage_type __storage_type; + static const int __bits_per_word = _It::__bits_per_word; + difference_type __n = __last1 - __first1; + if (__n > 0) + { + // do first word + if (__first1.__ctz_ != 0) { - // do first word - if (__first1.__ctz_ != 0) + unsigned __clz_f = __bits_per_word - __first1.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); + __storage_type __b = *__first1.__seg_ & __m; + unsigned __clz_r = __bits_per_word - __first2.__ctz_; + __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); + __m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); + if (__first2.__ctz_ > __first1.__ctz_) + { + if ((*__first2.__seg_ & __m) != (__b << (__first2.__ctz_ - __first1.__ctz_))) { - unsigned __clz_f = __bits_per_word - __first1.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz_f), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn)); - __storage_type __b = *__first1.__seg_ & __m; - unsigned __clz_r = __bits_per_word - __first2.__ctz_; - __storage_type __ddn = _CUDA_VSTD::min<__storage_type>(__dn, __clz_r); - __m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn)); - if (__first2.__ctz_ > __first1.__ctz_) - { - if ((*__first2.__seg_ & __m) != (__b << (__first2.__ctz_ - __first1.__ctz_))) - return false; - } - else - { - if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ - __first2.__ctz_))) - return false; - } - __first2.__seg_ += (__ddn + __first2.__ctz_) / __bits_per_word; - __first2.__ctz_ = static_cast((__ddn + __first2.__ctz_) % __bits_per_word); - __dn -= __ddn; - if (__dn > 0) - { - __m = ~__storage_type(0) >> (__bits_per_word - __dn); - if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ + __ddn))) - return false; - __first2.__ctz_ = static_cast(__dn); - } - ++__first1.__seg_; - // __first1.__ctz_ = 0; + return false; } - // __first1.__ctz_ == 0; - // do middle words - unsigned __clz_r = __bits_per_word - __first2.__ctz_; - __storage_type __m = ~__storage_type(0) << __first2.__ctz_; - for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_) + } + else + { + if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ - __first2.__ctz_))) { - __storage_type __b = *__first1.__seg_; - if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_)) - return false; - ++__first2.__seg_; - if ((*__first2.__seg_ & ~__m) != (__b >> __clz_r)) - return false; + return false; } - // do last word - if (__n > 0) + } + __first2.__seg_ += (__ddn + __first2.__ctz_) / __bits_per_word; + __first2.__ctz_ = static_cast((__ddn + __first2.__ctz_) % __bits_per_word); + __dn -= __ddn; + if (__dn > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __dn); + if ((*__first2.__seg_ & __m) != (__b >> (__first1.__ctz_ + __ddn))) { - __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__first1.__seg_ & __m; - __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__clz_r)); - __m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); - if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_)) - return false; - __first2.__seg_ += (__dn + __first2.__ctz_) / __bits_per_word; - __first2.__ctz_ = static_cast((__dn + __first2.__ctz_) % __bits_per_word); - __n -= __dn; - if (__n > 0) - { - __m = 
~__storage_type(0) >> (__bits_per_word - __n); - if ((*__first2.__seg_ & __m) != (__b >> __dn)) - return false; - } + return false; } + __first2.__ctz_ = static_cast(__dn); + } + ++__first1.__seg_; + // __first1.__ctz_ = 0; } - return true; + // __first1.__ctz_ == 0; + // do middle words + unsigned __clz_r = __bits_per_word - __first2.__ctz_; + __storage_type __m = ~__storage_type(0) << __first2.__ctz_; + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_) + { + __storage_type __b = *__first1.__seg_; + if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_)) + { + return false; + } + ++__first2.__seg_; + if ((*__first2.__seg_ & ~__m) != (__b >> __clz_r)) + { + return false; + } + } + // do last word + if (__n > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__first1.__seg_ & __m; + __storage_type __dn = _CUDA_VSTD::min(__n, static_cast(__clz_r)); + __m = (~__storage_type(0) << __first2.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn)); + if ((*__first2.__seg_ & __m) != (__b << __first2.__ctz_)) + { + return false; + } + __first2.__seg_ += (__dn + __first2.__ctz_) / __bits_per_word; + __first2.__ctz_ = static_cast((__dn + __first2.__ctz_) % __bits_per_word); + __n -= __dn; + if (__n > 0) + { + __m = ~__storage_type(0) >> (__bits_per_word - __n); + if ((*__first2.__seg_ & __m) != (__b >> __dn)) + { + return false; + } + } + } + } + return true; } template -bool -__equal_aligned(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, - __bit_iterator<_Cp, _IC2> __first2) +bool __equal_aligned( + __bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - typedef __bit_iterator<_Cp, _IC1> _It; - typedef typename _It::difference_type difference_type; - typedef typename _It::__storage_type __storage_type; - static const int __bits_per_word = _It::__bits_per_word; - difference_type __n = __last1 - __first1; + typedef __bit_iterator<_Cp, _IC1> _It; + typedef typename _It::difference_type difference_type; + typedef typename _It::__storage_type __storage_type; + static const int __bits_per_word = _It::__bits_per_word; + difference_type __n = __last1 - __first1; + if (__n > 0) + { + // do first word + if (__first1.__ctz_ != 0) + { + unsigned __clz = __bits_per_word - __first1.__ctz_; + difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); + __n -= __dn; + __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); + if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) + { + return false; + } + ++__first2.__seg_; + ++__first1.__seg_; + // __first1.__ctz_ = 0; + // __first2.__ctz_ = 0; + } + // __first1.__ctz_ == 0; + // __first2.__ctz_ == 0; + // do middle words + for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first1.__seg_, ++__first2.__seg_) + { + if (*__first2.__seg_ != *__first1.__seg_) + { + return false; + } + } + // do last word if (__n > 0) { - // do first word - if (__first1.__ctz_ != 0) - { - unsigned __clz = __bits_per_word - __first1.__ctz_; - difference_type __dn = _CUDA_VSTD::min(static_cast(__clz), __n); - __n -= __dn; - __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn)); - if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) - return false; - ++__first2.__seg_; - ++__first1.__seg_; - // __first1.__ctz_ = 0; - // __first2.__ctz_ = 0; - } - // __first1.__ctz_ == 0; - // __first2.__ctz_ == 0; - // do middle words - for (; __n >= 
__bits_per_word; __n -= __bits_per_word, ++__first1.__seg_, ++__first2.__seg_) - if (*__first2.__seg_ != *__first1.__seg_) - return false; - // do last word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) - return false; - } + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m)) + { + return false; + } } - return true; + } + return true; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -bool +inline _LIBCUDACXX_INLINE_VISIBILITY bool equal(__bit_iterator<_Cp, _IC1> __first1, __bit_iterator<_Cp, _IC1> __last1, __bit_iterator<_Cp, _IC2> __first2) { - if (__first1.__ctz_ == __first2.__ctz_) - return __equal_aligned(__first1, __last1, __first2); - return __equal_unaligned(__first1, __last1, __first2); + if (__first1.__ctz_ == __first2.__ctz_) + { + return __equal_aligned(__first1, __last1, __first2); + } + return __equal_unaligned(__first1, __last1, __first2); } -template +template class __bit_iterator { public: - typedef typename _Cp::difference_type difference_type; - typedef bool value_type; - typedef __bit_iterator pointer; - typedef typename conditional<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp> >::type reference; - typedef random_access_iterator_tag iterator_category; + typedef typename _Cp::difference_type difference_type; + typedef bool value_type; + typedef __bit_iterator pointer; + typedef typename conditional<_IsConst, __bit_const_reference<_Cp>, __bit_reference<_Cp>>::type reference; + typedef random_access_iterator_tag iterator_category; private: - typedef typename _Cp::__storage_type __storage_type; - typedef typename conditional<_IsConst, typename _Cp::__const_storage_pointer, - typename _Cp::__storage_pointer>::type __storage_pointer; - static const unsigned __bits_per_word = _Cp::__bits_per_word; + typedef typename _Cp::__storage_type __storage_type; + typedef typename conditional<_IsConst, typename _Cp::__const_storage_pointer, typename _Cp::__storage_pointer>::type + __storage_pointer; + static const unsigned __bits_per_word = _Cp::__bits_per_word; - __storage_pointer __seg_; - unsigned __ctz_; + __storage_pointer __seg_; + unsigned __ctz_; public: - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator() noexcept + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator() noexcept #if _CCCL_STD_VER > 2011 - : __seg_(nullptr), __ctz_(0) + : __seg_(nullptr) + , __ctz_(0) #endif - {} - // avoid re-declaring a copy constructor for the non-const version. - using __type_for_copy_to_const = - _If<_IsConst, __bit_iterator<_Cp, false>, struct __private_nat>; - - _LIBCUDACXX_INLINE_VISIBILITY - __bit_iterator(const __type_for_copy_to_const& __it) noexcept - : __seg_(__it.__seg_), __ctz_(__it.__ctz_) {} - - _LIBCUDACXX_INLINE_VISIBILITY reference operator*() const noexcept - {return reference(__seg_, __storage_type(1) << __ctz_);} - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator++() - { - if (__ctz_ != __bits_per_word-1) - ++__ctz_; - else - { - __ctz_ = 0; - ++__seg_; - } - return *this; - } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator++(int) + {} + // avoid re-declaring a copy constructor for the non-const version. 
+ using __type_for_copy_to_const = _If<_IsConst, __bit_iterator<_Cp, false>, struct __private_nat>; + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator(const __type_for_copy_to_const& __it) noexcept + : __seg_(__it.__seg_) + , __ctz_(__it.__ctz_) + {} + + _LIBCUDACXX_INLINE_VISIBILITY reference operator*() const noexcept + { + return reference(__seg_, __storage_type(1) << __ctz_); + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator++() + { + if (__ctz_ != __bits_per_word - 1) { - __bit_iterator __tmp = *this; - ++(*this); - return __tmp; + ++__ctz_; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator--() + else { - if (__ctz_ != 0) - --__ctz_; - else - { - __ctz_ = __bits_per_word - 1; - --__seg_; - } - return *this; + __ctz_ = 0; + ++__seg_; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator--(int) + return *this; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator++(int) + { + __bit_iterator __tmp = *this; + ++(*this); + return __tmp; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator--() + { + if (__ctz_ != 0) { - __bit_iterator __tmp = *this; - --(*this); - return __tmp; + --__ctz_; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator+=(difference_type __n) - { - if (__n >= 0) - __seg_ += (__n + __ctz_) / __bits_per_word; - else - __seg_ += static_cast(__n - __bits_per_word + __ctz_ + 1) - / static_cast(__bits_per_word); - __n &= (__bits_per_word - 1); - __ctz_ = static_cast((__n + __ctz_) % __bits_per_word); - return *this; - } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator-=(difference_type __n) + else { - return *this += -__n; + __ctz_ = __bits_per_word - 1; + --__seg_; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator+(difference_type __n) const + return *this; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator--(int) + { + __bit_iterator __tmp = *this; + --(*this); + return __tmp; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator+=(difference_type __n) + { + if (__n >= 0) { - __bit_iterator __t(*this); - __t += __n; - return __t; + __seg_ += (__n + __ctz_) / __bits_per_word; } - - _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator-(difference_type __n) const + else { - __bit_iterator __t(*this); - __t -= __n; - return __t; + __seg_ += static_cast(__n - __bits_per_word + __ctz_ + 1) + / static_cast(__bits_per_word); } - - _LIBCUDACXX_INLINE_VISIBILITY - friend __bit_iterator operator+(difference_type __n, const __bit_iterator& __it) {return __it + __n;} - - _LIBCUDACXX_INLINE_VISIBILITY - friend difference_type operator-(const __bit_iterator& __x, const __bit_iterator& __y) - {return (__x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_;} - - _LIBCUDACXX_INLINE_VISIBILITY reference operator[](difference_type __n) const {return *(*this + __n);} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator==(const __bit_iterator& __x, const __bit_iterator& __y) - {return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_;} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator!=(const __bit_iterator& __x, const __bit_iterator& __y) - {return !(__x == __y);} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<(const __bit_iterator& __x, const __bit_iterator& __y) - {return __x.__seg_ < __y.__seg_ || (__x.__seg_ == __y.__seg_ && __x.__ctz_ < __y.__ctz_);} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>(const __bit_iterator& __x, const __bit_iterator& __y) - {return __y < __x;} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<=(const __bit_iterator& __x, 
const __bit_iterator& __y) - {return !(__y < __x);} - - _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>=(const __bit_iterator& __x, const __bit_iterator& __y) - {return !(__x < __y);} + __n &= (__bits_per_word - 1); + __ctz_ = static_cast<unsigned>((__n + __ctz_) % __bits_per_word); + return *this; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator& operator-=(difference_type __n) + { + return *this += -__n; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator+(difference_type __n) const + { + __bit_iterator __t(*this); + __t += __n; + return __t; + } + + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator operator-(difference_type __n) const + { + __bit_iterator __t(*this); + __t -= __n; + return __t; + } + + _LIBCUDACXX_INLINE_VISIBILITY friend __bit_iterator operator+(difference_type __n, const __bit_iterator& __it) + { + return __it + __n; + } + + _LIBCUDACXX_INLINE_VISIBILITY friend difference_type operator-(const __bit_iterator& __x, const __bit_iterator& __y) + { + return (__x.__seg_ - __y.__seg_) * __bits_per_word + __x.__ctz_ - __y.__ctz_; + } + + _LIBCUDACXX_INLINE_VISIBILITY reference operator[](difference_type __n) const + { + return *(*this + __n); + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator==(const __bit_iterator& __x, const __bit_iterator& __y) + { + return __x.__seg_ == __y.__seg_ && __x.__ctz_ == __y.__ctz_; + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator!=(const __bit_iterator& __x, const __bit_iterator& __y) + { + return !(__x == __y); + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<(const __bit_iterator& __x, const __bit_iterator& __y) + { + return __x.__seg_ < __y.__seg_ || (__x.__seg_ == __y.__seg_ && __x.__ctz_ < __y.__ctz_); + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>(const __bit_iterator& __x, const __bit_iterator& __y) + { + return __y < __x; + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator<=(const __bit_iterator& __x, const __bit_iterator& __y) + { + return !(__y < __x); + } + + _LIBCUDACXX_INLINE_VISIBILITY friend bool operator>=(const __bit_iterator& __x, const __bit_iterator& __y) + { + return !(__x < __y); + } private: - _LIBCUDACXX_INLINE_VISIBILITY - __bit_iterator(__storage_pointer __s, unsigned __ctz) noexcept - : __seg_(__s), __ctz_(__ctz) {} - - friend typename _Cp::__self; - - friend class __bit_reference<_Cp>; - friend class __bit_const_reference<_Cp>; - friend class __bit_iterator<_Cp, true>; - template <class _Dp> friend struct __bit_array; - template <class _Dp> friend void __fill_n_false(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); - template <class _Dp> friend void __fill_n_true(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); - template <class _Dp, bool _IC> friend __bit_iterator<_Dp, false> __copy_aligned(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template <class _Dp, bool _IC> friend __bit_iterator<_Dp, false> __copy_unaligned(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template <class _Dp, bool _IC> friend __bit_iterator<_Dp, false> copy(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template <class _Dp, bool _IC> friend __bit_iterator<_Dp, false> __copy_backward_aligned(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template <class _Dp, bool _IC> friend __bit_iterator<_Dp, false> __copy_backward_unaligned(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template <class _Dp, bool _IC> friend 
__bit_iterator<_Dp, false> copy_backward(__bit_iterator<_Dp, _IC> __first, - __bit_iterator<_Dp, _IC> __last, - __bit_iterator<_Dp, false> __result); - template <class __C1, class __C2> friend __bit_iterator<__C2, false> __swap_ranges_aligned(__bit_iterator<__C1, false>, - __bit_iterator<__C1, false>, - __bit_iterator<__C2, false>); - template <class __C1, class __C2> friend __bit_iterator<__C2, false> __swap_ranges_unaligned(__bit_iterator<__C1, false>, - __bit_iterator<__C1, false>, - __bit_iterator<__C2, false>); - template <class __C1, class __C2> friend __bit_iterator<__C2, false> swap_ranges(__bit_iterator<__C1, false>, - __bit_iterator<__C1, false>, - __bit_iterator<__C2, false>); - template <class _Dp> friend __bit_iterator<_Dp, false> rotate(__bit_iterator<_Dp, false>, - __bit_iterator<_Dp, false>, - __bit_iterator<_Dp, false>); - template <class _Dp, bool _IC1, bool _IC2> friend bool __equal_aligned(__bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC2>); - template <class _Dp, bool _IC1, bool _IC2> friend bool __equal_unaligned(__bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC2>); - template <class _Dp, bool _IC1, bool _IC2> friend bool equal(__bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC1>, - __bit_iterator<_Dp, _IC2>); - template <class _Dp, bool _IC> friend __bit_iterator<_Dp, _IC> __find_bool_true(__bit_iterator<_Dp, _IC>, - typename _Dp::size_type); - template <class _Dp, bool _IC> friend __bit_iterator<_Dp, _IC> __find_bool_false(__bit_iterator<_Dp, _IC>, - typename _Dp::size_type); - template <class _Dp, bool _IC> friend typename __bit_iterator<_Dp, _IC>::difference_type - __count_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); - template <class _Dp, bool _IC> friend typename __bit_iterator<_Dp, _IC>::difference_type - __count_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + _LIBCUDACXX_INLINE_VISIBILITY __bit_iterator(__storage_pointer __s, unsigned __ctz) noexcept + : __seg_(__s) + , __ctz_(__ctz) + {} + + friend typename _Cp::__self; + + friend class __bit_reference<_Cp>; + friend class __bit_const_reference<_Cp>; + friend class __bit_iterator<_Cp, true>; + template <class _Dp> + friend struct __bit_array; + template <class _Dp> + friend void __fill_n_false(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + template <class _Dp> + friend void __fill_n_true(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n); + template <class _Dp, bool _IC> + friend __bit_iterator<_Dp, false> __copy_aligned( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template <class _Dp, bool _IC> + friend __bit_iterator<_Dp, false> __copy_unaligned( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template <class _Dp, bool _IC> + friend __bit_iterator<_Dp, false> + copy(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template <class _Dp, bool _IC> + friend __bit_iterator<_Dp, false> __copy_backward_aligned( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template <class _Dp, bool _IC> + friend __bit_iterator<_Dp, false> __copy_backward_unaligned( + __bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template <class _Dp, bool _IC> + friend __bit_iterator<_Dp, false> + copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result); + template <class __C1, class __C2> + friend __bit_iterator<__C2, false> + __swap_ranges_aligned(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); + template <class __C1, class __C2> + friend __bit_iterator<__C2, false> + __swap_ranges_unaligned(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); + template <class __C1, class __C2> + friend 
__bit_iterator<__C2, false> + swap_ranges(__bit_iterator<__C1, false>, __bit_iterator<__C1, false>, __bit_iterator<__C2, false>); + template <class _Dp> + friend __bit_iterator<_Dp, false> + rotate(__bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>, __bit_iterator<_Dp, false>); + template <class _Dp, bool _IC1, bool _IC2> + friend bool __equal_aligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + template <class _Dp, bool _IC1, bool _IC2> + friend bool __equal_unaligned(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + template <class _Dp, bool _IC1, bool _IC2> + friend bool equal(__bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC1>, __bit_iterator<_Dp, _IC2>); + template <class _Dp, bool _IC> + friend __bit_iterator<_Dp, _IC> __find_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + template <class _Dp, bool _IC> + friend __bit_iterator<_Dp, _IC> __find_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + template <class _Dp, bool _IC> + friend typename __bit_iterator<_Dp, _IC>::difference_type + __count_bool_true(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); + template <class _Dp, bool _IC> + friend typename __bit_iterator<_Dp, _IC>::difference_type + __count_bool_false(__bit_iterator<_Dp, _IC>, typename _Dp::size_type); }; _LIBCUDACXX_END_NAMESPACE_STD _LIBCUDACXX_POP_MACROS -#endif // _LIBCUDACXX___BIT_REFERENCE +#endif // _LIBCUDACXX___BIT_REFERENCE diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config index b7e8bcc3118..274d7e020b4 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config @@ -43,13 +43,13 @@ #endif #if defined(_CCCL_COMPILER_MSVC) -#if _MSC_VER < 1917 -#define _LIBCUDACXX_COMPILER_MSVC_2017 -#elif _MSC_VER < 1930 -#define _LIBCUDACXX_COMPILER_MSVC_2019 -#else -#define _LIBCUDACXX_COMPILER_MSVC_2022 -#endif +# if _MSC_VER < 1917 +# define _LIBCUDACXX_COMPILER_MSVC_2017 +# elif _MSC_VER < 1930 +# define _LIBCUDACXX_COMPILER_MSVC_2019 +# else +# define _LIBCUDACXX_COMPILER_MSVC_2022 +# endif #endif // defined(_LIBCUDACXX_COMPILER_MSVC) #if defined(_CCCL_CUDA_COMPILER_NVCC) @@ -80,372 +80,368 @@ // __config may be included in `extern "C"` contexts, switch back to include extern "C++" { -#include +# include } -#ifdef __GNUC__ -# define _GNUC_VER (__GNUC__ * 100 + __GNUC_MINOR__) -#else -# define _GNUC_VER 0 -#endif +# ifdef __GNUC__ +# define _GNUC_VER (__GNUC__ * 100 + __GNUC_MINOR__) +# else +# define _GNUC_VER 0 +# endif -#define _LIBCUDACXX_VERSION 10000 +# define _LIBCUDACXX_VERSION 10000 -#ifndef _LIBCUDACXX_ABI_VERSION -# define _LIBCUDACXX_ABI_VERSION 1 -#endif +# ifndef _LIBCUDACXX_ABI_VERSION +# define _LIBCUDACXX_ABI_VERSION 1 +# endif -#define _LIBCUDACXX_STD_VER _CCCL_STD_VER +# define _LIBCUDACXX_STD_VER _CCCL_STD_VER -#if _CCCL_STD_VER < 2011 -# error libcu++ requires C++11 or later -#endif +# if _CCCL_STD_VER < 2011 +# error libcu++ requires C++11 or later +# endif -#if (defined(_CCCL_COMPILER_NVHPC) && defined(__linux__)) \ - || defined(_CCCL_COMPILER_NVRTC) - #define __ELF__ -#endif +# if (defined(_CCCL_COMPILER_NVHPC) && defined(__linux__)) || defined(_CCCL_COMPILER_NVRTC) +# define __ELF__ +# endif -#if defined(__ELF__) -# define _LIBCUDACXX_OBJECT_FORMAT_ELF 1 -#elif 
defined(__MACH__) +# define _LIBCUDACXX_OBJECT_FORMAT_MACHO 1 +# elif defined(_WIN32) +# define _LIBCUDACXX_OBJECT_FORMAT_COFF 1 +# elif defined(__wasm__) +# define _LIBCUDACXX_OBJECT_FORMAT_WASM 1 +# else +# error Unknown object file format +# endif -#if defined(_LIBCUDACXX_ABI_UNSTABLE) || _LIBCUDACXX_ABI_VERSION >= 2 || defined(__cuda_std__) +# if defined(_LIBCUDACXX_ABI_UNSTABLE) || _LIBCUDACXX_ABI_VERSION >= 2 || defined(__cuda_std__) // Change short string representation so that string data starts at offset 0, // improving its alignment in some cases. -# define _LIBCUDACXX_ABI_ALTERNATE_STRING_LAYOUT +# define _LIBCUDACXX_ABI_ALTERNATE_STRING_LAYOUT // Fix deque iterator type in order to support incomplete types. -# define _LIBCUDACXX_ABI_INCOMPLETE_TYPES_IN_DEQUE +# define _LIBCUDACXX_ABI_INCOMPLETE_TYPES_IN_DEQUE // Fix undefined behavior in how std::list stores its linked nodes. -# define _LIBCUDACXX_ABI_LIST_REMOVE_NODE_POINTER_UB +# define _LIBCUDACXX_ABI_LIST_REMOVE_NODE_POINTER_UB // Fix undefined behavior in how __tree stores its end and parent nodes. -# define _LIBCUDACXX_ABI_TREE_REMOVE_NODE_POINTER_UB +# define _LIBCUDACXX_ABI_TREE_REMOVE_NODE_POINTER_UB // Fix undefined behavior in how __hash_table stores its pointer types. -# define _LIBCUDACXX_ABI_FIX_UNORDERED_NODE_POINTER_UB -# define _LIBCUDACXX_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB -# define _LIBCUDACXX_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE +# define _LIBCUDACXX_ABI_FIX_UNORDERED_NODE_POINTER_UB +# define _LIBCUDACXX_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB +# define _LIBCUDACXX_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE // Don't use a nullptr_t simulation type in C++03 instead using C++11 nullptr // provided under the alternate keyword __nullptr, which changes the mangling // of nullptr_t. This option is ABI incompatible with GCC in C++03 mode. -# define _LIBCUDACXX_ABI_ALWAYS_USE_CXX11_NULLPTR +# define _LIBCUDACXX_ABI_ALWAYS_USE_CXX11_NULLPTR // Define the `pointer_safety` enum as a C++11 strongly typed enumeration // instead of as a class simulating an enum. If this option is enabled // `pointer_safety` and `get_pointer_safety()` will no longer be available // in C++03. -# define _LIBCUDACXX_ABI_POINTER_SAFETY_ENUM_TYPE +# define _LIBCUDACXX_ABI_POINTER_SAFETY_ENUM_TYPE // Define a key function for `bad_function_call` in the library, to centralize // its vtable and typeinfo to libc++ rather than having all other libraries // using that class define their own copies. -# define _LIBCUDACXX_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION +# define _LIBCUDACXX_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION // Enable optimized version of __do_get_(un)signed which avoids redundant copies. -# define _LIBCUDACXX_ABI_OPTIMIZED_LOCALE_NUM_GET +# define _LIBCUDACXX_ABI_OPTIMIZED_LOCALE_NUM_GET // Use the smallest possible integer type to represent the index of the variant. // Previously libc++ used "unsigned int" exclusively. -# define _LIBCUDACXX_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION +# define _LIBCUDACXX_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION // Unstable attempt to provide a more optimized std::function -# define _LIBCUDACXX_ABI_OPTIMIZED_FUNCTION +# define _LIBCUDACXX_ABI_OPTIMIZED_FUNCTION // All the regex constants must be distinct and nonzero. 
-# define _LIBCUDACXX_ABI_REGEX_CONSTANTS_NONZERO -#elif _LIBCUDACXX_ABI_VERSION == 1 -# if !defined(_LIBCUDACXX_OBJECT_FORMAT_COFF) +# define _LIBCUDACXX_ABI_REGEX_CONSTANTS_NONZERO +# elif _LIBCUDACXX_ABI_VERSION == 1 +# if !defined(_LIBCUDACXX_OBJECT_FORMAT_COFF) // Enable compiling copies of now inline methods into the dylib to support // applications compiled against older libraries. This is unnecessary with // COFF dllexport semantics, since dllexport forces a non-inline definition // of inline functions to be emitted anyway. Our own non-inline copy would // conflict with the dllexport-emitted copy, so we disable it. -# define _LIBCUDACXX_DEPRECATED_ABI_LEGACY_LIBRARY_DEFINITIONS_FOR_INLINE_FUNCTIONS +# define _LIBCUDACXX_DEPRECATED_ABI_LEGACY_LIBRARY_DEFINITIONS_FOR_INLINE_FUNCTIONS +# endif # endif -#endif -#ifndef __has_attribute -#define __has_attribute(__x) 0 -#endif +# ifndef __has_attribute +# define __has_attribute(__x) 0 +# endif -#ifndef __has_builtin -#define __has_builtin(__x) 0 -#endif +# ifndef __has_builtin +# define __has_builtin(__x) 0 +# endif -#ifndef __has_extension -#define __has_extension(__x) 0 -#endif +# ifndef __has_extension +# define __has_extension(__x) 0 +# endif -#ifndef __has_feature -#define __has_feature(__x) 0 -#endif +# ifndef __has_feature +# define __has_feature(__x) 0 +# endif -#ifndef __has_cpp_attribute -#define __has_cpp_attribute(__x) 0 -#endif +# ifndef __has_cpp_attribute +# define __has_cpp_attribute(__x) 0 +# endif // '__is_identifier' returns '0' if '__x' is a reserved identifier provided by // the compiler and '1' otherwise. -#ifndef __is_identifier -#define __is_identifier(__x) 1 -#endif +# ifndef __is_identifier +# define __is_identifier(__x) 1 +# endif -#ifndef __has_declspec_attribute -#define __has_declspec_attribute(__x) 0 -#endif +# ifndef __has_declspec_attribute +# define __has_declspec_attribute(__x) 0 +# endif -#define __has_keyword(__x) !(__is_identifier(__x)) +# define __has_keyword(__x) !(__is_identifier(__x)) -#ifndef __has_include -#define __has_include(...) 0 -#endif +# ifndef __has_include +# define __has_include(...) 0 +# endif -#if !defined(_CCCL_CUDA_COMPILER_NVCC) && !defined(_CCCL_COMPILER_NVRTC) +# if !defined(_CCCL_CUDA_COMPILER_NVCC) && !defined(_CCCL_COMPILER_NVRTC) // If NVCC is not being used can safely use `long double` without warnings -# define _LIBCUDACXX_HAS_COMPLEX_LONG_DOUBLE +# define _LIBCUDACXX_HAS_COMPLEX_LONG_DOUBLE // NVCC does not have a way of silencing non '_' prefixed UDLs -# define _LIBCUDACXX_HAS_STL_LITERALS -#endif +# define _LIBCUDACXX_HAS_STL_LITERALS +# endif -#if defined(_CCCL_COMPILER_GCC) && __cplusplus < 201103L -#error "libc++ does not support using GCC with C++03. Please enable C++11" -#endif +# if defined(_CCCL_COMPILER_GCC) && __cplusplus < 201103L +# error "libc++ does not support using GCC with C++03. Please enable C++11" +# endif // FIXME: ABI detection should be done via compiler builtin macros. This // is just a placeholder until Clang implements such macros. For now assume // that Windows compilers pretending to be MSVC++ target the Microsoft ABI, // and allow the user to explicitly specify the ABI to handle cases where this // heuristic falls short. 
-#if defined(_LIBCUDACXX_ABI_FORCE_ITANIUM) && defined(_LIBCUDACXX_ABI_FORCE_MICROSOFT) -# error "Only one of _LIBCUDACXX_ABI_FORCE_ITANIUM and _LIBCUDACXX_ABI_FORCE_MICROSOFT can be defined" -#elif defined(_LIBCUDACXX_ABI_FORCE_ITANIUM) -# define _LIBCUDACXX_ABI_ITANIUM -#elif defined(_LIBCUDACXX_ABI_FORCE_MICROSOFT) -# define _LIBCUDACXX_ABI_MICROSOFT -#else -# if defined(_WIN32) && defined(_CCCL_COMPILER_MSVC) +# if defined(_LIBCUDACXX_ABI_FORCE_ITANIUM) && defined(_LIBCUDACXX_ABI_FORCE_MICROSOFT) +# error "Only one of _LIBCUDACXX_ABI_FORCE_ITANIUM and _LIBCUDACXX_ABI_FORCE_MICROSOFT can be defined" +# elif defined(_LIBCUDACXX_ABI_FORCE_ITANIUM) +# define _LIBCUDACXX_ABI_ITANIUM +# elif defined(_LIBCUDACXX_ABI_FORCE_MICROSOFT) # define _LIBCUDACXX_ABI_MICROSOFT # else -# define _LIBCUDACXX_ABI_ITANIUM +# if defined(_WIN32) && defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_ABI_MICROSOFT +# else +# define _LIBCUDACXX_ABI_ITANIUM +# endif # endif -#endif -#if defined(_LIBCUDACXX_ABI_MICROSOFT) && !defined(_LIBCUDACXX_NO_VCRUNTIME) -# define _LIBCUDACXX_ABI_VCRUNTIME -#endif +# if defined(_LIBCUDACXX_ABI_MICROSOFT) && !defined(_LIBCUDACXX_NO_VCRUNTIME) +# define _LIBCUDACXX_ABI_VCRUNTIME +# endif // Need to detect which libc we're using if we're on Linux. -#if defined(__linux__) -# include -# if defined(__GLIBC_PREREQ) -# define _LIBCUDACXX_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) -# else -# define _LIBCUDACXX_GLIBC_PREREQ(a, b) 0 -# endif // defined(__GLIBC_PREREQ) -#endif // defined(__linux__) - -#ifdef __LITTLE_ENDIAN__ -# if __LITTLE_ENDIAN__ -# define _LIBCUDACXX_LITTLE_ENDIAN -# endif // __LITTLE_ENDIAN__ -#endif // __LITTLE_ENDIAN__ - -#ifdef __BIG_ENDIAN__ -# if __BIG_ENDIAN__ -# define _LIBCUDACXX_BIG_ENDIAN -# endif // __BIG_ENDIAN__ -#endif // __BIG_ENDIAN__ +# if defined(__linux__) +# include +# if defined(__GLIBC_PREREQ) +# define _LIBCUDACXX_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) +# else +# define _LIBCUDACXX_GLIBC_PREREQ(a, b) 0 +# endif // defined(__GLIBC_PREREQ) +# endif // defined(__linux__) + +# ifdef __LITTLE_ENDIAN__ +# if __LITTLE_ENDIAN__ +# define _LIBCUDACXX_LITTLE_ENDIAN +# endif // __LITTLE_ENDIAN__ +# endif // __LITTLE_ENDIAN__ + +# ifdef __BIG_ENDIAN__ +# if __BIG_ENDIAN__ +# define _LIBCUDACXX_BIG_ENDIAN +# endif // __BIG_ENDIAN__ +# endif // __BIG_ENDIAN__ + +# ifdef __BYTE_ORDER__ +# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define _LIBCUDACXX_LITTLE_ENDIAN +# elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define _LIBCUDACXX_BIG_ENDIAN +# endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# endif // __BYTE_ORDER__ + +# ifdef __FreeBSD__ +# include +# if _BYTE_ORDER == _LITTLE_ENDIAN +# define _LIBCUDACXX_LITTLE_ENDIAN +# else // _BYTE_ORDER == _LITTLE_ENDIAN +# define _LIBCUDACXX_BIG_ENDIAN +# endif // _BYTE_ORDER == _LITTLE_ENDIAN +# ifndef __LONG_LONG_SUPPORTED +# define _LIBCUDACXX_HAS_NO_LONG_LONG +# endif // __LONG_LONG_SUPPORTED +# endif // __FreeBSD__ + +# ifdef __NetBSD__ +# include +# if _BYTE_ORDER == _LITTLE_ENDIAN +# define _LIBCUDACXX_LITTLE_ENDIAN +# else // _BYTE_ORDER == _LITTLE_ENDIAN +# define _LIBCUDACXX_BIG_ENDIAN +# endif // _BYTE_ORDER == _LITTLE_ENDIAN +# define _LIBCUDACXX_HAS_QUICK_EXIT +# endif // __NetBSD__ -#ifdef __BYTE_ORDER__ -# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -# define _LIBCUDACXX_LITTLE_ENDIAN -# elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -# define _LIBCUDACXX_BIG_ENDIAN -# endif // __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#endif // __BYTE_ORDER__ - -#ifdef __FreeBSD__ -# include -# if _BYTE_ORDER == 
_LITTLE_ENDIAN +# if defined(_WIN32) +# define _LIBCUDACXX_WIN32API # define _LIBCUDACXX_LITTLE_ENDIAN -# else // _BYTE_ORDER == _LITTLE_ENDIAN -# define _LIBCUDACXX_BIG_ENDIAN -# endif // _BYTE_ORDER == _LITTLE_ENDIAN -# ifndef __LONG_LONG_SUPPORTED -# define _LIBCUDACXX_HAS_NO_LONG_LONG -# endif // __LONG_LONG_SUPPORTED -#endif // __FreeBSD__ - -#ifdef __NetBSD__ -# include -# if _BYTE_ORDER == _LITTLE_ENDIAN -# define _LIBCUDACXX_LITTLE_ENDIAN -# else // _BYTE_ORDER == _LITTLE_ENDIAN -# define _LIBCUDACXX_BIG_ENDIAN -# endif // _BYTE_ORDER == _LITTLE_ENDIAN -# define _LIBCUDACXX_HAS_QUICK_EXIT -#endif // __NetBSD__ - -#if defined(_WIN32) -# define _LIBCUDACXX_WIN32API -# define _LIBCUDACXX_LITTLE_ENDIAN -# define _LIBCUDACXX_SHORT_WCHAR 1 +# define _LIBCUDACXX_SHORT_WCHAR 1 // Both MinGW and native MSVC provide a "MSVC"-like environment -# define _LIBCUDACXX_MSVCRT_LIKE +# define _LIBCUDACXX_MSVCRT_LIKE // If mingw not explicitly detected, assume using MS C runtime only if // a MS compatibility version is specified. -# if defined(_CCCL_COMPILER_MSVC) && !defined(__MINGW32__) -# define _LIBCUDACXX_MSVCRT // Using Microsoft's C Runtime library -# endif -# if (defined(_M_AMD64) || defined(__x86_64__)) || (defined(_M_ARM) || defined(__arm__)) -# define _LIBCUDACXX_HAS_BITSCAN64 -# endif -# define _LIBCUDACXX_HAS_OPEN_WITH_WCHAR -# if defined(_LIBCUDACXX_MSVCRT) -# define _LIBCUDACXX_HAS_QUICK_EXIT -# endif +# if defined(_CCCL_COMPILER_MSVC) && !defined(__MINGW32__) +# define _LIBCUDACXX_MSVCRT // Using Microsoft's C Runtime library +# endif +# if (defined(_M_AMD64) || defined(__x86_64__)) || (defined(_M_ARM) || defined(__arm__)) +# define _LIBCUDACXX_HAS_BITSCAN64 +# endif +# define _LIBCUDACXX_HAS_OPEN_WITH_WCHAR +# if defined(_LIBCUDACXX_MSVCRT) +# define _LIBCUDACXX_HAS_QUICK_EXIT +# endif // Some CRT APIs are unavailable to store apps -# if defined(WINAPI_FAMILY) -# include -# if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) && \ - (!defined(WINAPI_PARTITION_SYSTEM) || \ - !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_SYSTEM)) -# define _LIBCUDACXX_WINDOWS_STORE_APP +# if defined(WINAPI_FAMILY) +# include +# if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) \ + && (!defined(WINAPI_PARTITION_SYSTEM) || !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_SYSTEM)) +# define _LIBCUDACXX_WINDOWS_STORE_APP +# endif # endif -# endif -#endif // defined(_WIN32) +# endif // defined(_WIN32) -#ifdef __sun__ -# include -# ifdef _LITTLE_ENDIAN -# define _LIBCUDACXX_LITTLE_ENDIAN +# ifdef __sun__ +# include +# ifdef _LITTLE_ENDIAN +# define _LIBCUDACXX_LITTLE_ENDIAN +# else +# define _LIBCUDACXX_BIG_ENDIAN +# endif +# endif // __sun__ + +# if defined(__CloudABI__) +// Certain architectures provide arc4random(). Prefer using +// arc4random() over /dev/{u,}random to make it possible to obtain +// random data even when using sandboxing mechanisms such as chroots, +// Capsicum, etc. +# define _LIBCUDACXX_USING_ARC4_RANDOM +# elif defined(__Fuchsia__) || defined(__wasi__) +# define _LIBCUDACXX_USING_GETENTROPY +# elif defined(__native_client__) +// NaCl's sandbox (which PNaCl also runs in) doesn't allow filesystem access, +// including accesses to the special files under /dev. C++11's +// std::random_device is instead exposed through a NaCl syscall. +# define _LIBCUDACXX_USING_NACL_RANDOM +# elif defined(_LIBCUDACXX_WIN32API) +# define _LIBCUDACXX_USING_WIN32_RANDOM # else -# define _LIBCUDACXX_BIG_ENDIAN -# endif -#endif // __sun__ - -#if defined(__CloudABI__) - // Certain architectures provide arc4random(). 
Prefer using - // arc4random() over /dev/{u,}random to make it possible to obtain - // random data even when using sandboxing mechanisms such as chroots, - // Capsicum, etc. -# define _LIBCUDACXX_USING_ARC4_RANDOM -#elif defined(__Fuchsia__) || defined(__wasi__) -# define _LIBCUDACXX_USING_GETENTROPY -#elif defined(__native_client__) - // NaCl's sandbox (which PNaCl also runs in) doesn't allow filesystem access, - // including accesses to the special files under /dev. C++11's - // std::random_device is instead exposed through a NaCl syscall. -# define _LIBCUDACXX_USING_NACL_RANDOM -#elif defined(_LIBCUDACXX_WIN32API) -# define _LIBCUDACXX_USING_WIN32_RANDOM -#else -# define _LIBCUDACXX_USING_DEV_RANDOM -#endif +# define _LIBCUDACXX_USING_DEV_RANDOM +# endif -#ifndef _LIBCUDACXX_LITTLE_ENDIAN -#if defined(_CCCL_COMPILER_NVRTC) -# define _LIBCUDACXX_LITTLE_ENDIAN -#endif -#endif // _LIBCUDACXX_LITTLE_ENDIAN +# ifndef _LIBCUDACXX_LITTLE_ENDIAN +# if defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_LITTLE_ENDIAN +# endif +# endif // _LIBCUDACXX_LITTLE_ENDIAN + +# if !defined(_LIBCUDACXX_LITTLE_ENDIAN) && !defined(_LIBCUDACXX_BIG_ENDIAN) +# include +# if __BYTE_ORDER == __LITTLE_ENDIAN +# define _LIBCUDACXX_LITTLE_ENDIAN +# elif __BYTE_ORDER == __BIG_ENDIAN +# define _LIBCUDACXX_BIG_ENDIAN +# else // __BYTE_ORDER == __BIG_ENDIAN +# error unable to determine endian +# endif +# endif // !defined(_LIBCUDACXX_LITTLE_ENDIAN) && !defined(_LIBCUDACXX_BIG_ENDIAN) -#if !defined(_LIBCUDACXX_LITTLE_ENDIAN) && !defined(_LIBCUDACXX_BIG_ENDIAN) -# include -# if __BYTE_ORDER == __LITTLE_ENDIAN -# define _LIBCUDACXX_LITTLE_ENDIAN -# elif __BYTE_ORDER == __BIG_ENDIAN -# define _LIBCUDACXX_BIG_ENDIAN -# else // __BYTE_ORDER == __BIG_ENDIAN -# error unable to determine endian +# if __has_attribute(__no_sanitize__) && !defined(_CCCL_COMPILER_GCC) +# define _LIBCUDACXX_NO_CFI __attribute__((__no_sanitize__("cfi"))) +# else +# define _LIBCUDACXX_NO_CFI # endif -#endif // !defined(_LIBCUDACXX_LITTLE_ENDIAN) && !defined(_LIBCUDACXX_BIG_ENDIAN) - -#if __has_attribute(__no_sanitize__) && !defined(_CCCL_COMPILER_GCC) -# define _LIBCUDACXX_NO_CFI __attribute__((__no_sanitize__("cfi"))) -#else -# define _LIBCUDACXX_NO_CFI -#endif -#if (defined(__ISO_C_VISIBLE) && __ISO_C_VISIBLE >= 2011) || __cplusplus >= 201103L -# if defined(__FreeBSD__) -# define _LIBCUDACXX_HAS_QUICK_EXIT -# define _LIBCUDACXX_HAS_C11_FEATURES -# elif defined(__Fuchsia__) || defined(__wasi__) -# define _LIBCUDACXX_HAS_QUICK_EXIT -# define _LIBCUDACXX_HAS_TIMESPEC_GET -# define _LIBCUDACXX_HAS_C11_FEATURES -# elif defined(__linux__) -# if !defined(_LIBCUDACXX_HAS_MUSL_LIBC) -# if _LIBCUDACXX_GLIBC_PREREQ(2, 15) || defined(__BIONIC__) -# define _LIBCUDACXX_HAS_QUICK_EXIT -# endif -# if _LIBCUDACXX_GLIBC_PREREQ(2, 17) -# define _LIBCUDACXX_HAS_C11_FEATURES -# define _LIBCUDACXX_HAS_TIMESPEC_GET -# endif -# else // defined(_LIBCUDACXX_HAS_MUSL_LIBC) +# if (defined(__ISO_C_VISIBLE) && __ISO_C_VISIBLE >= 2011) || __cplusplus >= 201103L +# if defined(__FreeBSD__) +# define _LIBCUDACXX_HAS_QUICK_EXIT +# define _LIBCUDACXX_HAS_C11_FEATURES +# elif defined(__Fuchsia__) || defined(__wasi__) # define _LIBCUDACXX_HAS_QUICK_EXIT # define _LIBCUDACXX_HAS_TIMESPEC_GET # define _LIBCUDACXX_HAS_C11_FEATURES -# endif -# endif // __linux__ -#endif +# elif defined(__linux__) +# if !defined(_LIBCUDACXX_HAS_MUSL_LIBC) +# if _LIBCUDACXX_GLIBC_PREREQ(2, 15) || defined(__BIONIC__) +# define _LIBCUDACXX_HAS_QUICK_EXIT +# endif +# if _LIBCUDACXX_GLIBC_PREREQ(2, 17) +# define 
_LIBCUDACXX_HAS_C11_FEATURES +# define _LIBCUDACXX_HAS_TIMESPEC_GET +# endif +# else // defined(_LIBCUDACXX_HAS_MUSL_LIBC) +# define _LIBCUDACXX_HAS_QUICK_EXIT +# define _LIBCUDACXX_HAS_TIMESPEC_GET +# define _LIBCUDACXX_HAS_C11_FEATURES +# endif +# endif // __linux__ +# endif -#if defined(_CCCL_COMPILER_NVRTC) -# define __alignof(x) alignof(x) -#endif // _CCCL_COMPILER_NVRTC +# if defined(_CCCL_COMPILER_NVRTC) +# define __alignof(x) alignof(x) +# endif // _CCCL_COMPILER_NVRTC -#if defined(_CCCL_COMPILER_MSVC) -# define __alignof__ __alignof -#endif +# if defined(_CCCL_COMPILER_MSVC) +# define __alignof__ __alignof +# endif -#define _LIBCUDACXX_ALIGNOF(_Tp) alignof(_Tp) -#define _LIBCUDACXX_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp) +# define _LIBCUDACXX_ALIGNOF(_Tp) alignof(_Tp) +# define _LIBCUDACXX_PREFERRED_ALIGNOF(_Tp) __alignof(_Tp) -#if defined(_CCCL_COMPILER_MSVC) -# define _CCCL_ALIGNAS_TYPE(x) alignas(x) -# define _CCCL_ALIGNAS(x) __declspec(align(x)) -#elif __has_feature(cxx_alignas) -# define _CCCL_ALIGNAS_TYPE(x) alignas(x) -# define _CCCL_ALIGNAS(x) alignas(x) -#else -# define _CCCL_ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCUDACXX_ALIGNOF(x)))) -# define _CCCL_ALIGNAS(x) __attribute__((__aligned__(x))) -#endif // !_CCCL_COMPILER_MSVC && !__has_feature(cxx_alignas) - -#define _LIBCUDACXX_TOSTRING2(_STR) #_STR -#define _LIBCUDACXX_TOSTRING(_STR) _LIBCUDACXX_TOSTRING2(_STR) +# if defined(_CCCL_COMPILER_MSVC) +# define _CCCL_ALIGNAS_TYPE(x) alignas(x) +# define _CCCL_ALIGNAS(x) __declspec(align(x)) +# elif __has_feature(cxx_alignas) +# define _CCCL_ALIGNAS_TYPE(x) alignas(x) +# define _CCCL_ALIGNAS(x) alignas(x) +# else +# define _CCCL_ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCUDACXX_ALIGNOF(x)))) +# define _CCCL_ALIGNAS(x) __attribute__((__aligned__(x))) +# endif // !_CCCL_COMPILER_MSVC && !__has_feature(cxx_alignas) + +# define _LIBCUDACXX_TOSTRING2(_STR) #_STR +# define _LIBCUDACXX_TOSTRING(_STR) _LIBCUDACXX_TOSTRING2(_STR) // This is wrapped in __CUDA_ARCH__ to prevent error: "ignoring '#pragma unroll' // [-Werror=unknown-pragmas]" -#if defined(__CUDA_ARCH__) -#if defined(_CCCL_COMPILER_MSVC) -# define _LIBCUDACXX_PRAGMA_UNROLL(_N) __pragma(_LIBCUDACXX_TOSTRING(unroll _N)) -#else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv -# define _LIBCUDACXX_PRAGMA_UNROLL(_N) _Pragma(_LIBCUDACXX_TOSTRING(unroll _N)) -#endif // !_CCCL_COMPILER_MSVC -#else // ^^^ __CUDA_ARCH__ ^^^ / vvv !__CUDA_ARCH__ vvv -# define _LIBCUDACXX_PRAGMA_UNROLL(_N) -#endif // !__CUDA_ARCH__ +# if defined(__CUDA_ARCH__) +# if defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_PRAGMA_UNROLL(_N) __pragma(_LIBCUDACXX_TOSTRING(unroll _N)) +# else // ^^^ _CCCL_COMPILER_MSVC ^^^ / vvv !_CCCL_COMPILER_MSVC vvv +# define _LIBCUDACXX_PRAGMA_UNROLL(_N) _Pragma(_LIBCUDACXX_TOSTRING(unroll _N)) +# endif // !_CCCL_COMPILER_MSVC +# else // ^^^ __CUDA_ARCH__ ^^^ / vvv !__CUDA_ARCH__ vvv +# define _LIBCUDACXX_PRAGMA_UNROLL(_N) +# endif // !__CUDA_ARCH__ -#if defined(_CCCL_COMPILER_MSVC) -#define _LIBCUDACXX_ALWAYS_INLINE __forceinline -#else -#define _LIBCUDACXX_ALWAYS_INLINE __attribute__ ((__always_inline__)) -#endif // !_CCCL_COMPILER_MSVC +# if defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_ALWAYS_INLINE __forceinline +# else +# define _LIBCUDACXX_ALWAYS_INLINE __attribute__((__always_inline__)) +# endif // !_CCCL_COMPILER_MSVC -#if defined(__cuda_std__) -#define _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(size, ptr) (size <= 8) -#elif defined(_CCCL_COMPILER_CLANG) || defined(_CCCL_COMPILER_GCC) -#define 
_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(...) __atomic_always_lock_free(__VA_ARGS__) -#endif // __cuda_std__ +# if defined(__cuda_std__) +# define _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(size, ptr) (size <= 8) +# elif defined(_CCCL_COMPILER_CLANG) || defined(_CCCL_COMPILER_GCC) +# define _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(...) __atomic_always_lock_free(__VA_ARGS__) +# endif // __cuda_std__ // https://bugs.llvm.org/show_bug.cgi?id=44517 -#define __check_builtin(__x) (__has_builtin(__##__x) || \ - __has_keyword(__##__x) || \ - __has_feature(__x)) +# define __check_builtin(__x) (__has_builtin(__##__x) || __has_keyword(__##__x) || __has_feature(__x)) // We work around old clang versions (before clang-10) not supporting __has_builtin via __check_builtin // We work around old intel versions (before 2021.3) not supporting __has_builtin via __check_builtin @@ -453,486 +449,422 @@ extern "C++" { // MSVC needs manual handling, has no real way of checking builtins so all is manual // GCC needs manual handling, before gcc-10 as that finally supports __has_builtin -#if __check_builtin(array_rank) -#define _LIBCUDACXX_ARRAY_RANK(...) __array_rank(__VA_ARGS__) -#endif // __check_builtin(array_rank) +# if __check_builtin(array_rank) +# define _LIBCUDACXX_ARRAY_RANK(...) __array_rank(__VA_ARGS__) +# endif // __check_builtin(array_rank) // nvhpc has a bug where it supports __builtin_addressof but does not mark it via __check_builtin -#if __check_builtin(builtin_addressof) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 700) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVHPC) -#define _LIBCUDACXX_ADDRESSOF(...) __builtin_addressof(__VA_ARGS__) -#endif // __check_builtin(builtin_addressof) - -#if __check_builtin(builtin_bit_cast) \ - || (defined(_CCCL_COMPILER_MSVC) && _MSC_VER > 1925) -#define _LIBCUDACXX_BIT_CAST(...) __builtin_bit_cast(__VA_ARGS__) -#endif // __check_builtin(builtin_bit_cast) - -#if __check_builtin(builtin_is_constant_evaluated) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 900) \ - || (defined(_CCCL_COMPILER_MSVC) && _MSC_VER > 1924 && !defined(_CCCL_CUDACC_BELOW_11_3)) -#define _LIBCUDACXX_IS_CONSTANT_EVALUATED(...) __builtin_is_constant_evaluated(__VA_ARGS__) -#endif // __check_builtin(builtin_is_constant_evaluated) +# if __check_builtin(builtin_addressof) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 700) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVHPC) +# define _LIBCUDACXX_ADDRESSOF(...) __builtin_addressof(__VA_ARGS__) +# endif // __check_builtin(builtin_addressof) + +# if __check_builtin(builtin_bit_cast) || (defined(_CCCL_COMPILER_MSVC) && _MSC_VER > 1925) +# define _LIBCUDACXX_BIT_CAST(...) __builtin_bit_cast(__VA_ARGS__) +# endif // __check_builtin(builtin_bit_cast) + +# if __check_builtin(builtin_is_constant_evaluated) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 900) \ + || (defined(_CCCL_COMPILER_MSVC) && _MSC_VER > 1924 && !defined(_CCCL_CUDACC_BELOW_11_3)) +# define _LIBCUDACXX_IS_CONSTANT_EVALUATED(...) __builtin_is_constant_evaluated(__VA_ARGS__) +# endif // __check_builtin(builtin_is_constant_evaluated) // NVCC and NVRTC in C++11 mode freaks out about `__builtin_is_constant_evaluated`. 
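// A minimal illustrative sketch (the helper name below is invented for illustration and does
// not appear in this header): downstream code only checks whether the detection macro is
// defined and falls back to a conservative answer otherwise, which is why the hunk below can
// simply #undef _LIBCUDACXX_IS_CONSTANT_EVALUATED for the compilers mentioned above.
constexpr bool __example_is_constant_evaluated() noexcept
{
#if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED)
  return _LIBCUDACXX_IS_CONSTANT_EVALUATED(); // expands to __builtin_is_constant_evaluated()
#else
  return false; // assume runtime evaluation when the builtin cannot be detected
#endif
}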
-#if _CCCL_STD_VER < 2014 \ - && (defined(_CCCL_CUDA_COMPILER_NVCC) \ - || defined(_CCCL_COMPILER_NVRTC) \ - || defined(_CCCL_COMPILER_NVHPC)) -#undef _LIBCUDACXX_IS_CONSTANT_EVALUATED -#endif // _CCCL_STD_VER < 2014 && _CCCL_CUDA_COMPILER_NVCC - -#if __check_builtin(builtin_launder) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 700) -#define _LIBCUDACXX_LAUNDER(...) __builtin_launder(__VA_ARGS__) -#endif // __check_builtin(builtin_launder) +# if _CCCL_STD_VER < 2014 \ + && (defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_COMPILER_NVRTC) || defined(_CCCL_COMPILER_NVHPC)) +# undef _LIBCUDACXX_IS_CONSTANT_EVALUATED +# endif // _CCCL_STD_VER < 2014 && _CCCL_CUDA_COMPILER_NVCC + +# if __check_builtin(builtin_launder) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 700) +# define _LIBCUDACXX_LAUNDER(...) __builtin_launder(__VA_ARGS__) +# endif // __check_builtin(builtin_launder) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(decay) -#define _LIBCUDACXX_DECAY(...) __decay(__VA_ARGS__) -#endif // __check_builtin(decay) - -#if __check_builtin(has_nothrow_assign) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_HAS_NOTHROW_ASSIGN(...) __has_nothrow_assign(__VA_ARGS__) -#endif // __check_builtin(has_nothrow_assign) - -#if __check_builtin(has_nothrow_constructor) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_HAS_NOTHROW_CONSTRUCTOR(...) __has_nothrow_constructor(__VA_ARGS__) -#endif // __check_builtin(has_nothrow_constructor) - -#if __check_builtin(has_nothrow_copy) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_HAS_NOTHROW_COPY(...) __has_nothrow_copy(__VA_ARGS__) -#endif // __check_builtin(has_nothrow_copy) - -#if __check_builtin(has_trivial_constructor) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_HAS_TRIVIAL_CONSTRUCTOR(...) __has_trivial_constructor(__VA_ARGS__) -#endif // __check_builtin(has_trivial_constructor) - -#if __check_builtin(has_trivial_destructor) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_HAS_TRIVIAL_DESTRUCTOR(...) __has_trivial_destructor(__VA_ARGS__) -#endif // __check_builtin(has_trivial_destructor) - -#if __check_builtin(has_unique_object_representations) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 700) -#define _LIBCUDACXX_HAS_UNIQUE_OBJECT_REPRESENTATIONS(...) __has_unique_object_representations(__VA_ARGS__) -#endif // __check_builtin(has_unique_object_representations) - -#if __check_builtin(has_virtual_destructor) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_HAS_VIRTUAL_DESTRUCTOR(...) __has_virtual_destructor(__VA_ARGS__) -#endif // __check_builtin(has_virtual_destructor) - -#if __check_builtin(is_aggregate) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 700) \ - || (defined(_CCCL_COMPILER_MSVC) && _MSC_VER > 1914) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_AGGREGATE(...) 
__is_aggregate(__VA_ARGS__) -#endif // __check_builtin(is_aggregate) - -#if __check_builtin(is_array) -#define _LIBCUDACXX_IS_ARRAY(...) __is_array(__VA_ARGS__) -#endif // __check_builtin(is_array) +# if 0 // __check_builtin(decay) +# define _LIBCUDACXX_DECAY(...) __decay(__VA_ARGS__) +# endif // __check_builtin(decay) + +# if __check_builtin(has_nothrow_assign) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_HAS_NOTHROW_ASSIGN(...) __has_nothrow_assign(__VA_ARGS__) +# endif // __check_builtin(has_nothrow_assign) + +# if __check_builtin(has_nothrow_constructor) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_HAS_NOTHROW_CONSTRUCTOR(...) __has_nothrow_constructor(__VA_ARGS__) +# endif // __check_builtin(has_nothrow_constructor) + +# if __check_builtin(has_nothrow_copy) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_HAS_NOTHROW_COPY(...) __has_nothrow_copy(__VA_ARGS__) +# endif // __check_builtin(has_nothrow_copy) + +# if __check_builtin(has_trivial_constructor) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_HAS_TRIVIAL_CONSTRUCTOR(...) __has_trivial_constructor(__VA_ARGS__) +# endif // __check_builtin(has_trivial_constructor) + +# if __check_builtin(has_trivial_destructor) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_HAS_TRIVIAL_DESTRUCTOR(...) __has_trivial_destructor(__VA_ARGS__) +# endif // __check_builtin(has_trivial_destructor) + +# if __check_builtin(has_unique_object_representations) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 700) +# define _LIBCUDACXX_HAS_UNIQUE_OBJECT_REPRESENTATIONS(...) __has_unique_object_representations(__VA_ARGS__) +# endif // __check_builtin(has_unique_object_representations) + +# if __check_builtin(has_virtual_destructor) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_HAS_VIRTUAL_DESTRUCTOR(...) __has_virtual_destructor(__VA_ARGS__) +# endif // __check_builtin(has_virtual_destructor) + +# if __check_builtin(is_aggregate) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 700) \ + || (defined(_CCCL_COMPILER_MSVC) && _MSC_VER > 1914) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_AGGREGATE(...) __is_aggregate(__VA_ARGS__) +# endif // __check_builtin(is_aggregate) + +# if __check_builtin(is_array) +# define _LIBCUDACXX_IS_ARRAY(...) __is_array(__VA_ARGS__) +# endif // __check_builtin(is_array) // TODO: Clang incorrectly reports that __is_array is true for T[0]. // Re-enable the branch once https://llvm.org/PR54705 is fixed. -#ifndef _LIBCUDACXX_USE_IS_ARRAY_FALLBACK -#if defined(_CCCL_COMPILER_CLANG) -#define _LIBCUDACXX_USE_IS_ARRAY_FALLBACK -#endif // _CCCL_COMPILER_CLANG -#endif // !_LIBCUDACXX_USE_IS_ARRAY_FALLBACK - -#if __check_builtin(is_assignable) \ - || defined(_CCCL_COMPILER_MSVC) -#define _LIBCUDACXX_IS_ASSIGNABLE(...) 
__is_assignable(__VA_ARGS__) -#endif // __check_builtin(is_assignable) - -#if __check_builtin(is_base_of) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_BASE_OF(...) __is_base_of(__VA_ARGS__) -#endif // __check_builtin(is_base_of) - -#if __check_builtin(is_class) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_CLASS(...) __is_class(__VA_ARGS__) -#endif // __check_builtin(is_class) - -#if __check_builtin(is_constructible) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 800) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_CONSTRUCTIBLE(...) __is_constructible(__VA_ARGS__) -#endif // __check_builtin(is_constructible) - -#if __check_builtin(is_convertible_to) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_CONVERTIBLE_TO(...) __is_convertible_to(__VA_ARGS__) -#endif // __check_builtin(is_convertible_to) - -#if __check_builtin(is_destructible) \ - || defined(_CCCL_COMPILER_MSVC) -#define _LIBCUDACXX_IS_DESTRUCTIBLE(...) __is_destructible(__VA_ARGS__) -#endif // __check_builtin(is_destructible) - -#if __check_builtin(is_empty) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_EMPTY(...) __is_empty(__VA_ARGS__) -#endif // __check_builtin(is_empty) - -#if __check_builtin(is_enum) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_ENUM(...) __is_enum(__VA_ARGS__) -#endif // __check_builtin(is_enum) - -#if __check_builtin(is_final) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 407) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_FINAL(...) __is_final(__VA_ARGS__) -#endif // __check_builtin(is_final) - -#if __check_builtin(is_function) \ - && !defined(_CCCL_CUDA_COMPILER_NVCC) -#define _LIBCUDACXX_IS_FUNCTION(...) __is_function(__VA_ARGS__) -#endif // __check_builtin(is_function) - -#if __check_builtin(is_literal_type) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 406) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_LITERAL(...) __is_literal_type(__VA_ARGS__) -#endif // __check_builtin(is_literal_type) - -#if __check_builtin(is_lvalue_reference) -#define _LIBCUDACXX_IS_LVALUE_REFERENCE(...) __is_lvalue_reference(__VA_ARGS__) -#endif // __check_builtin(is_lvalue_reference) - -#ifndef _LIBCUDACXX_USE_IS_LVALUE_REFERENCE_FALLBACK -#if defined(_CCCL_CUDACC_BELOW_11_3) -#define _LIBCUDACXX_USE_IS_LVALUE_REFERENCE_FALLBACK -#endif // nvcc < 11.3 -#endif // !_LIBCUDACXX_USE_IS_LVALUE_REFERENCE_FALLBACK - -#if __check_builtin(is_nothrow_assignable) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_NOTHROW_ASSIGNABLE(...) __is_nothrow_assignable(__VA_ARGS__) -#endif // __check_builtin(is_nothrow_assignable) - -#if __check_builtin(is_nothrow_constructible) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_NOTHROW_CONSTRUCTIBLE(...) 
__is_nothrow_constructible(__VA_ARGS__) -#endif // __check_builtin(is_nothrow_constructible) - -#if __check_builtin(is_nothrow_destructible) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_NOTHROW_DESTRUCTIBLE(...) __is_nothrow_destructible(__VA_ARGS__) -#endif // __check_builtin(is_nothrow_destructible) - -#if __check_builtin(is_object) -#define _LIBCUDACXX_IS_OBJECT(...) __is_object(__VA_ARGS__) -#endif // __check_builtin(is_object) - -#ifndef _LIBCUDACXX_USE_IS_OBJECT_FALLBACK -#if defined(_CCCL_CUDACC_BELOW_11_3) -#define _LIBCUDACXX_USE_IS_OBJECT_FALLBACK -#endif // nvcc < 11.3 -#endif // !_LIBCUDACXX_USE_IS_OBJECT_FALLBACK - -#if __check_builtin(is_pod) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_POD(...) __is_pod(__VA_ARGS__) -#endif // __check_builtin(is_pod) +# ifndef _LIBCUDACXX_USE_IS_ARRAY_FALLBACK +# if defined(_CCCL_COMPILER_CLANG) +# define _LIBCUDACXX_USE_IS_ARRAY_FALLBACK +# endif // _CCCL_COMPILER_CLANG +# endif // !_LIBCUDACXX_USE_IS_ARRAY_FALLBACK + +# if __check_builtin(is_assignable) || defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_IS_ASSIGNABLE(...) __is_assignable(__VA_ARGS__) +# endif // __check_builtin(is_assignable) + +# if __check_builtin(is_base_of) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) || defined(_CCCL_COMPILER_MSVC) \ + || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_BASE_OF(...) __is_base_of(__VA_ARGS__) +# endif // __check_builtin(is_base_of) + +# if __check_builtin(is_class) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) || defined(_CCCL_COMPILER_MSVC) \ + || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_CLASS(...) __is_class(__VA_ARGS__) +# endif // __check_builtin(is_class) + +# if __check_builtin(is_constructible) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 800) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_CONSTRUCTIBLE(...) __is_constructible(__VA_ARGS__) +# endif // __check_builtin(is_constructible) + +# if __check_builtin(is_convertible_to) || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_CONVERTIBLE_TO(...) __is_convertible_to(__VA_ARGS__) +# endif // __check_builtin(is_convertible_to) + +# if __check_builtin(is_destructible) || defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_IS_DESTRUCTIBLE(...) __is_destructible(__VA_ARGS__) +# endif // __check_builtin(is_destructible) + +# if __check_builtin(is_empty) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) || defined(_CCCL_COMPILER_MSVC) \ + || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_EMPTY(...) __is_empty(__VA_ARGS__) +# endif // __check_builtin(is_empty) + +# if __check_builtin(is_enum) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) || defined(_CCCL_COMPILER_MSVC) \ + || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_ENUM(...) __is_enum(__VA_ARGS__) +# endif // __check_builtin(is_enum) + +# if __check_builtin(is_final) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 407) || defined(_CCCL_COMPILER_MSVC) \ + || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_FINAL(...) __is_final(__VA_ARGS__) +# endif // __check_builtin(is_final) + +# if __check_builtin(is_function) && !defined(_CCCL_CUDA_COMPILER_NVCC) +# define _LIBCUDACXX_IS_FUNCTION(...) 
__is_function(__VA_ARGS__) +# endif // __check_builtin(is_function) + +# if __check_builtin(is_literal_type) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 406) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_LITERAL(...) __is_literal_type(__VA_ARGS__) +# endif // __check_builtin(is_literal_type) + +# if __check_builtin(is_lvalue_reference) +# define _LIBCUDACXX_IS_LVALUE_REFERENCE(...) __is_lvalue_reference(__VA_ARGS__) +# endif // __check_builtin(is_lvalue_reference) + +# ifndef _LIBCUDACXX_USE_IS_LVALUE_REFERENCE_FALLBACK +# if defined(_CCCL_CUDACC_BELOW_11_3) +# define _LIBCUDACXX_USE_IS_LVALUE_REFERENCE_FALLBACK +# endif // nvcc < 11.3 +# endif // !_LIBCUDACXX_USE_IS_LVALUE_REFERENCE_FALLBACK + +# if __check_builtin(is_nothrow_assignable) || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_NOTHROW_ASSIGNABLE(...) __is_nothrow_assignable(__VA_ARGS__) +# endif // __check_builtin(is_nothrow_assignable) + +# if __check_builtin(is_nothrow_constructible) || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_NOTHROW_CONSTRUCTIBLE(...) __is_nothrow_constructible(__VA_ARGS__) +# endif // __check_builtin(is_nothrow_constructible) + +# if __check_builtin(is_nothrow_destructible) || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_NOTHROW_DESTRUCTIBLE(...) __is_nothrow_destructible(__VA_ARGS__) +# endif // __check_builtin(is_nothrow_destructible) + +# if __check_builtin(is_object) +# define _LIBCUDACXX_IS_OBJECT(...) __is_object(__VA_ARGS__) +# endif // __check_builtin(is_object) + +# ifndef _LIBCUDACXX_USE_IS_OBJECT_FALLBACK +# if defined(_CCCL_CUDACC_BELOW_11_3) +# define _LIBCUDACXX_USE_IS_OBJECT_FALLBACK +# endif // nvcc < 11.3 +# endif // !_LIBCUDACXX_USE_IS_OBJECT_FALLBACK + +# if __check_builtin(is_pod) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) || defined(_CCCL_COMPILER_MSVC) \ + || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_POD(...) __is_pod(__VA_ARGS__) +# endif // __check_builtin(is_pod) // libstdc++ defines this as a function, breaking functionality -#if 0 // __check_builtin(is_pointer) -#define _LIBCUDACXX_IS_POINTER(...) __is_pointer(__VA_ARGS__) -#endif // __check_builtin(is_pointer) +# if 0 // __check_builtin(is_pointer) +# define _LIBCUDACXX_IS_POINTER(...) __is_pointer(__VA_ARGS__) +# endif // __check_builtin(is_pointer) -#if __check_builtin(is_polymorphic) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_POLYMORPHIC(...) __is_polymorphic(__VA_ARGS__) -#endif // __check_builtin(is_polymorphic) +# if __check_builtin(is_polymorphic) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_POLYMORPHIC(...) __is_polymorphic(__VA_ARGS__) +# endif // __check_builtin(is_polymorphic) -#if __check_builtin(is_reference) -#define _LIBCUDACXX_IS_REFERENCE(...) __is_reference(__VA_ARGS__) -#endif // __check_builtin(is_reference) +# if __check_builtin(is_reference) +# define _LIBCUDACXX_IS_REFERENCE(...) __is_reference(__VA_ARGS__) +# endif // __check_builtin(is_reference) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(is_referenceable) -#define _LIBCUDACXX_IS_REFERENCEABLE(...) 
__is_referenceable(__VA_ARGS__) -#endif // __check_builtin(is_referenceable) +# if 0 // __check_builtin(is_referenceable) +# define _LIBCUDACXX_IS_REFERENCEABLE(...) __is_referenceable(__VA_ARGS__) +# endif // __check_builtin(is_referenceable) -#if __check_builtin(is_rvalue_reference) -#define _LIBCUDACXX_IS_RVALUE_REFERENCE(...) __is_rvalue_reference(__VA_ARGS__) -#endif // __check_builtin(is_rvalue_reference) +# if __check_builtin(is_rvalue_reference) +# define _LIBCUDACXX_IS_RVALUE_REFERENCE(...) __is_rvalue_reference(__VA_ARGS__) +# endif // __check_builtin(is_rvalue_reference) -#if __check_builtin(is_same) && !defined(_CCCL_CUDA_COMPILER_NVCC) -#define _LIBCUDACXX_IS_SAME(...) __is_same(__VA_ARGS__) -#endif // __check_builtin(is_same) +# if __check_builtin(is_same) && !defined(_CCCL_CUDA_COMPILER_NVCC) +# define _LIBCUDACXX_IS_SAME(...) __is_same(__VA_ARGS__) +# endif // __check_builtin(is_same) // libstdc++ defines this as a function, breaking functionality -#if 0 // __check_builtin(is_scalar) -#define _LIBCUDACXX_IS_SCALAR(...) __is_scalar(__VA_ARGS__) -#endif // __check_builtin(is_scalar) +# if 0 // __check_builtin(is_scalar) +# define _LIBCUDACXX_IS_SCALAR(...) __is_scalar(__VA_ARGS__) +# endif // __check_builtin(is_scalar) // libstdc++ defines this as a function, breaking functionality -#if 0 // __check_builtin(is_signed) -#define _LIBCUDACXX_IS_SIGNED(...) __is_signed(__VA_ARGS__) -#endif // __check_builtin(is_signed) - -#if __check_builtin(is_standard_layout) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 407) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_STANDARD_LAYOUT(...) __is_standard_layout(__VA_ARGS__) -#endif // __check_builtin(is_standard_layout) - -#if __check_builtin(is_trivial) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 405) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_TRIVIAL(...) __is_trivial(__VA_ARGS__) -#endif // __check_builtin(is_trivial) - -#if __check_builtin(is_trivially_assignable) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 501) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_TRIVIALLY_ASSIGNABLE(...) __is_trivially_assignable(__VA_ARGS__) -#endif // __check_builtin(is_trivially_assignable) - -#if __check_builtin(is_trivially_constructible) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 501) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_TRIVIALLY_CONSTRUCTIBLE(...) __is_trivially_constructible(__VA_ARGS__) -#endif // __check_builtin(is_trivially_constructible) - -#if __check_builtin(is_trivially_copyable) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 501) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_TRIVIALLY_COPYABLE(...) __is_trivially_copyable(__VA_ARGS__) -#endif // __check_builtin(is_trivially_copyable) - -#if __check_builtin(is_trivially_destructible) \ - || defined(_CCCL_COMPILER_MSVC) -#define _LIBCUDACXX_IS_TRIVIALLY_DESTRUCTIBLE(...) __is_trivially_destructible(__VA_ARGS__) -#endif // __check_builtin(is_trivially_destructible) - -#if __check_builtin(is_union) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_IS_UNION(...) __is_union(__VA_ARGS__) -#endif // __check_builtin(is_union) - -#if __check_builtin(is_unsigned) -#define _LIBCUDACXX_IS_UNSIGNED(...) 
__is_unsigned(__VA_ARGS__) -#endif // __check_builtin(is_unsigned) - -#ifndef _LIBCUDACXX_USE_IS_UNSIGNED_FALLBACK -#if defined(_CCCL_CUDACC_BELOW_11_3) -#define _LIBCUDACXX_USE_IS_UNSIGNED_FALLBACK -#endif // nvcc < 11.3 -#endif // !_LIBCUDACXX_USE_IS_UNSIGNED_FALLBACK +# if 0 // __check_builtin(is_signed) +# define _LIBCUDACXX_IS_SIGNED(...) __is_signed(__VA_ARGS__) +# endif // __check_builtin(is_signed) + +# if __check_builtin(is_standard_layout) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 407) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_STANDARD_LAYOUT(...) __is_standard_layout(__VA_ARGS__) +# endif // __check_builtin(is_standard_layout) + +# if __check_builtin(is_trivial) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 405) || defined(_CCCL_COMPILER_MSVC) \ + || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_TRIVIAL(...) __is_trivial(__VA_ARGS__) +# endif // __check_builtin(is_trivial) + +# if __check_builtin(is_trivially_assignable) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 501) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_TRIVIALLY_ASSIGNABLE(...) __is_trivially_assignable(__VA_ARGS__) +# endif // __check_builtin(is_trivially_assignable) + +# if __check_builtin(is_trivially_constructible) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 501) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_TRIVIALLY_CONSTRUCTIBLE(...) __is_trivially_constructible(__VA_ARGS__) +# endif // __check_builtin(is_trivially_constructible) + +# if __check_builtin(is_trivially_copyable) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 501) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_TRIVIALLY_COPYABLE(...) __is_trivially_copyable(__VA_ARGS__) +# endif // __check_builtin(is_trivially_copyable) + +# if __check_builtin(is_trivially_destructible) || defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_IS_TRIVIALLY_DESTRUCTIBLE(...) __is_trivially_destructible(__VA_ARGS__) +# endif // __check_builtin(is_trivially_destructible) + +# if __check_builtin(is_union) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 403) || defined(_CCCL_COMPILER_MSVC) \ + || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_IS_UNION(...) __is_union(__VA_ARGS__) +# endif // __check_builtin(is_union) + +# if __check_builtin(is_unsigned) +# define _LIBCUDACXX_IS_UNSIGNED(...) __is_unsigned(__VA_ARGS__) +# endif // __check_builtin(is_unsigned) + +# ifndef _LIBCUDACXX_USE_IS_UNSIGNED_FALLBACK +# if defined(_CCCL_CUDACC_BELOW_11_3) +# define _LIBCUDACXX_USE_IS_UNSIGNED_FALLBACK +# endif // nvcc < 11.3 +# endif // !_LIBCUDACXX_USE_IS_UNSIGNED_FALLBACK // libstdc++ defines this as a function, breaking functionality -#if 0 // __check_builtin(is_void) -#define _LIBCUDACXX_IS_VOID(...) __is_void(__VA_ARGS__) -#endif // __check_builtin(is_void) +# if 0 // __check_builtin(is_void) +# define _LIBCUDACXX_IS_VOID(...) __is_void(__VA_ARGS__) +# endif // __check_builtin(is_void) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(make_signed) -#define _LIBCUDACXX_MAKE_SIGNED(...) __make_signed(__VA_ARGS__) -#endif // __check_builtin(make_signed) +# if 0 // __check_builtin(make_signed) +# define _LIBCUDACXX_MAKE_SIGNED(...) __make_signed(__VA_ARGS__) +# endif // __check_builtin(make_signed) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(make_unsigned) -#define _LIBCUDACXX_MAKE_UNSIGNED(...) 
__make_unsigned(__VA_ARGS__) -#endif // __check_builtin(make_unsigned) +# if 0 // __check_builtin(make_unsigned) +# define _LIBCUDACXX_MAKE_UNSIGNED(...) __make_unsigned(__VA_ARGS__) +# endif // __check_builtin(make_unsigned) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(remove_all_extents) -#define _LIBCUDACXX_REMOVE_ALL_EXTENTS(...) __remove_all_extents(__VA_ARGS__) -#endif // __check_builtin(remove_all_extents) +# if 0 // __check_builtin(remove_all_extents) +# define _LIBCUDACXX_REMOVE_ALL_EXTENTS(...) __remove_all_extents(__VA_ARGS__) +# endif // __check_builtin(remove_all_extents) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(remove_const) -#define _LIBCUDACXX_REMOVE_CONST(...) __remove_const(__VA_ARGS__) -#endif // __check_builtin(remove_const) +# if 0 // __check_builtin(remove_const) +# define _LIBCUDACXX_REMOVE_CONST(...) __remove_const(__VA_ARGS__) +# endif // __check_builtin(remove_const) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(remove_cv) -#define _LIBCUDACXX_REMOVE_CV(...) __remove_cv(__VA_ARGS__) -#endif // __check_builtin(remove_cv) +# if 0 // __check_builtin(remove_cv) +# define _LIBCUDACXX_REMOVE_CV(...) __remove_cv(__VA_ARGS__) +# endif // __check_builtin(remove_cv) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(remove_cvref) -#define _LIBCUDACXX_REMOVE_CVREF(...) __remove_cvref(__VA_ARGS__) -#endif // __check_builtin(remove_cvref) +# if 0 // __check_builtin(remove_cvref) +# define _LIBCUDACXX_REMOVE_CVREF(...) __remove_cvref(__VA_ARGS__) +# endif // __check_builtin(remove_cvref) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(remove_extent) -#define _LIBCUDACXX_REMOVE_EXTENT(...) __remove_extent(__VA_ARGS__) -#endif // __check_builtin(remove_extent) +# if 0 // __check_builtin(remove_extent) +# define _LIBCUDACXX_REMOVE_EXTENT(...) __remove_extent(__VA_ARGS__) +# endif // __check_builtin(remove_extent) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(remove_pointer) -#define _LIBCUDACXX_REMOVE_POINTER(...) __remove_pointer(__VA_ARGS__) -#endif // __check_builtin(remove_pointer) +# if 0 // __check_builtin(remove_pointer) +# define _LIBCUDACXX_REMOVE_POINTER(...) __remove_pointer(__VA_ARGS__) +# endif // __check_builtin(remove_pointer) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(remove_reference_t) -#define _LIBCUDACXX_REMOVE_REFERENCE_T(...) __remove_reference_t(__VA_ARGS__) -#endif // __check_builtin(remove_reference_t) +# if 0 // __check_builtin(remove_reference_t) +# define _LIBCUDACXX_REMOVE_REFERENCE_T(...) __remove_reference_t(__VA_ARGS__) +# endif // __check_builtin(remove_reference_t) // Disabled due to libstdc++ conflict -#if 0 // __check_builtin(remove_volatile) -#define _LIBCUDACXX_REMOVE_VOLATILE(...) __remove_volatile(__VA_ARGS__) -#endif // __check_builtin(remove_volatile) +# if 0 // __check_builtin(remove_volatile) +# define _LIBCUDACXX_REMOVE_VOLATILE(...) __remove_volatile(__VA_ARGS__) +# endif // __check_builtin(remove_volatile) -#if __check_builtin(underlying_type) \ - || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 407) \ - || defined(_CCCL_COMPILER_MSVC) \ - || defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_UNDERLYING_TYPE(...) __underlying_type(__VA_ARGS__) -#endif // __check_builtin(underlying_type) +# if __check_builtin(underlying_type) || (defined(_CCCL_COMPILER_GCC) && _GNUC_VER >= 407) \ + || defined(_CCCL_COMPILER_MSVC) || defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_UNDERLYING_TYPE(...) 
__underlying_type(__VA_ARGS__) +# endif // __check_builtin(underlying_type) -#if defined(_CCCL_COMPILER_CLANG) +# if defined(_CCCL_COMPILER_CLANG) // _LIBCUDACXX_ALTERNATE_STRING_LAYOUT is an old name for // _LIBCUDACXX_ABI_ALTERNATE_STRING_LAYOUT left here for backward compatibility. -#if defined(_LIBCUDACXX_ALTERNATE_STRING_LAYOUT) -#define _LIBCUDACXX_ABI_ALTERNATE_STRING_LAYOUT -#endif +# if defined(_LIBCUDACXX_ALTERNATE_STRING_LAYOUT) +# define _LIBCUDACXX_ABI_ALTERNATE_STRING_LAYOUT +# endif -#if __cplusplus < 201103L +# if __cplusplus < 201103L typedef __char16_t char16_t; typedef __char32_t char32_t; -#endif +# endif -#if !(__has_feature(cxx_strong_enums)) -#define _LIBCUDACXX_HAS_NO_STRONG_ENUMS -#endif +# if !(__has_feature(cxx_strong_enums)) +# define _LIBCUDACXX_HAS_NO_STRONG_ENUMS +# endif -#if !(__has_feature(cxx_lambdas)) -#define _LIBCUDACXX_HAS_NO_LAMBDAS -#endif +# if !(__has_feature(cxx_lambdas)) +# define _LIBCUDACXX_HAS_NO_LAMBDAS +# endif -#if !(__has_feature(cxx_nullptr)) -# if (__has_extension(cxx_nullptr) || __has_keyword(__nullptr)) && defined(_LIBCUDACXX_ABI_ALWAYS_USE_CXX11_NULLPTR) -# define nullptr __nullptr -# else -# define _LIBCUDACXX_HAS_NO_NULLPTR -# endif -#endif +# if !(__has_feature(cxx_nullptr)) +# if (__has_extension(cxx_nullptr) || __has_keyword(__nullptr)) \ + && defined(_LIBCUDACXX_ABI_ALWAYS_USE_CXX11_NULLPTR) +# define nullptr __nullptr +# else +# define _LIBCUDACXX_HAS_NO_NULLPTR +# endif +# endif -#if !(__has_feature(cxx_rvalue_references)) -#define _LIBCUDACXX_HAS_NO_RVALUE_REFERENCES -#endif +# if !(__has_feature(cxx_rvalue_references)) +# define _LIBCUDACXX_HAS_NO_RVALUE_REFERENCES +# endif -#if !(__has_feature(cxx_auto_type)) -#define _LIBCUDACXX_HAS_NO_AUTO_TYPE -#endif +# if !(__has_feature(cxx_auto_type)) +# define _LIBCUDACXX_HAS_NO_AUTO_TYPE +# endif -#if !(__has_feature(cxx_variadic_templates)) -#define _LIBCUDACXX_HAS_NO_VARIADICS -#endif +# if !(__has_feature(cxx_variadic_templates)) +# define _LIBCUDACXX_HAS_NO_VARIADICS +# endif -#if !(__has_feature(cxx_generalized_initializers)) -#define _LIBCUDACXX_HAS_NO_GENERALIZED_INITIALIZERS -#endif +# if !(__has_feature(cxx_generalized_initializers)) +# define _LIBCUDACXX_HAS_NO_GENERALIZED_INITIALIZERS +# endif // Objective-C++ features (opt-in) -#if __has_feature(objc_arc) -#define _LIBCUDACXX_HAS_OBJC_ARC -#endif +# if __has_feature(objc_arc) +# define _LIBCUDACXX_HAS_OBJC_ARC +# endif -#if __has_feature(objc_arc_weak) -#define _LIBCUDACXX_HAS_OBJC_ARC_WEAK -#endif +# if __has_feature(objc_arc_weak) +# define _LIBCUDACXX_HAS_OBJC_ARC_WEAK +# endif -#if !(__has_feature(cxx_variable_templates)) -#define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES -#endif +# if !(__has_feature(cxx_variable_templates)) +# define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES +# endif -#if !(__has_feature(cxx_noexcept)) -#define _LIBCUDACXX_HAS_NO_NOEXCEPT -#endif +# if !(__has_feature(cxx_noexcept)) +# define _LIBCUDACXX_HAS_NO_NOEXCEPT +# endif // Allow for build-time disabling of unsigned integer sanitization -#if !defined(_LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK) && __has_attribute(no_sanitize) -#define _LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK __attribute__((__no_sanitize__("unsigned-integer-overflow"))) -#endif +# if !defined(_LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK) && __has_attribute(no_sanitize) +# define _LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK \ + __attribute__((__no_sanitize__("unsigned-integer-overflow"))) +# endif -#define _LIBCUDACXX_DISABLE_EXTENSION_WARNING __extension__ +# 
define _LIBCUDACXX_DISABLE_EXTENSION_WARNING __extension__ -#elif defined(_CCCL_COMPILER_GCC) +# elif defined(_CCCL_COMPILER_GCC) -#ifndef _LIBCUDACXX_USE_IS_ASSIGNABLE_FALLBACK +# ifndef _LIBCUDACXX_USE_IS_ASSIGNABLE_FALLBACK // FIXME: GCC 8.0 supports this trait, but it has a bug. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91592 // https://godbolt.org/z/IljfIw -#define _LIBCUDACXX_USE_IS_ASSIGNABLE_FALLBACK -#endif // _LIBCUDACXX_USE_IS_ASSIGNABLE_FALLBACK +# define _LIBCUDACXX_USE_IS_ASSIGNABLE_FALLBACK +# endif // _LIBCUDACXX_USE_IS_ASSIGNABLE_FALLBACK // GCC 5 supports variable templates -#if !defined(__cpp_variable_templates) || __cpp_variable_templates < 201304L -#define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES -#endif +# if !defined(__cpp_variable_templates) || __cpp_variable_templates < 201304L +# define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES +# endif -#if _GNUC_VER < 600 -#define _LIBCUDACXX_GCC_MATH_IN_STD -#endif +# if _GNUC_VER < 600 +# define _LIBCUDACXX_GCC_MATH_IN_STD +# endif // NVCC cannot properly handle some deductions occuring within NOEXCEPT // C++17 mode causes reference instatiation errors in tuple -#if (_GNUC_VER >= 702 && _GNUC_VER <= 805) -#if defined(_CCCL_CUDA_COMPILER_NVCC) && _CCCL_STD_VER == 2017 -#define _LIBCUDACXX_NO_TUPLE_NOEXCEPT -#endif -#endif +# if (_GNUC_VER >= 702 && _GNUC_VER <= 805) +# if defined(_CCCL_CUDA_COMPILER_NVCC) && _CCCL_STD_VER == 2017 +# define _LIBCUDACXX_NO_TUPLE_NOEXCEPT +# endif +# endif -#define _LIBCUDACXX_DISABLE_EXTENSION_WARNING __extension__ +# define _LIBCUDACXX_DISABLE_EXTENSION_WARNING __extension__ -#elif defined(_CCCL_COMPILER_MSVC) +# elif defined(_CCCL_COMPILER_MSVC) -#define _LIBCUDACXX_WARNING(x) __pragma(message(__FILE__ "(" _LIBCUDACXX_TOSTRING(__LINE__) ") : warning note: " x)) +# define _LIBCUDACXX_WARNING(x) __pragma(message(__FILE__ "(" _LIBCUDACXX_TOSTRING(__LINE__) ") : warning note: " x)) // https://github.com/microsoft/STL/blob/master/stl/inc/yvals_core.h#L353 // warning C4100: 'quack': unreferenced formal parameter @@ -946,143 +878,125 @@ typedef __char32_t char32_t; // warning C4668: 'meow' is not defined as a preprocessor macro, replacing with '0' for '#if/#elif' // warning C4800: 'boo': forcing value to bool 'true' or 'false' (performance warning) // warning C4996: 'meow': was declared deprecated -#define _LIBCUDACXX_MSVC_DISABLED_WARNINGS \ - 4100 \ - 4127 \ - 4180 \ - 4197 \ - 4296 \ - 4324 \ - 4455 \ - 4503 \ - 4522 \ - 4668 \ - 4800 \ - 4996 \ - /**/ - -#if _MSC_VER < 1900 -#error "MSVC versions prior to Visual Studio 2015 are not supported" -#endif +# define _LIBCUDACXX_MSVC_DISABLED_WARNINGS 4100 4127 4180 4197 4296 4324 4455 4503 4522 4668 4800 4996 /**/ + +# if _MSC_VER < 1900 +# error "MSVC versions prior to Visual Studio 2015 are not supported" +# endif // MSVC implemented P0030R1 in 15.7, only available under C++17 -#if _MSC_VER < 1914 -#define _LIBCUDACXX_NO_HOST_CPP17_HYPOT -#endif +# if _MSC_VER < 1914 +# define _LIBCUDACXX_NO_HOST_CPP17_HYPOT +# endif -#if _MSC_VER < 1920 -#define _LIBCUDACXX_HAS_NO_NOEXCEPT_SFINAE -#define _LIBCUDACXX_HAS_NO_LOGICAL_METAFUNCTION_ALIASES -#endif +# if _MSC_VER < 1920 +# define _LIBCUDACXX_HAS_NO_NOEXCEPT_SFINAE +# define _LIBCUDACXX_HAS_NO_LOGICAL_METAFUNCTION_ALIASES +# endif // MSVC exposed __iso_volatile intrinsics beginning on 1924 for x86 -#if _MSC_VER < 1924 - #define _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN -#endif +# if _MSC_VER < 1924 +# define _LIBCUDACXX_MSVC_HAS_NO_ISO_INTRIN +# endif -#if _CCCL_STD_VER < 2014 -#define 
_LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES -#endif +# if _CCCL_STD_VER < 2014 +# define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES +# endif -#define _LIBCUDACXX_WEAK +# define _LIBCUDACXX_WEAK -#define _LIBCUDACXX_HAS_NO_VECTOR_EXTENSION +# define _LIBCUDACXX_HAS_NO_VECTOR_EXTENSION -#define _LIBCUDACXX_DISABLE_EXTENSION_WARNING +# define _LIBCUDACXX_DISABLE_EXTENSION_WARNING -#elif defined(_CCCL_COMPILER_IBM) +# elif defined(_CCCL_COMPILER_IBM) -#define _ATTRIBUTE(x) __attribute__((x)) +# define _ATTRIBUTE(x) __attribute__((x)) -#define _LIBCUDACXX_HAS_NO_UNICODE_CHARS -#define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES +# define _LIBCUDACXX_HAS_NO_UNICODE_CHARS +# define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES -#if defined(_AIX) -#define __MULTILOCALE_API -#endif +# if defined(_AIX) +# define __MULTILOCALE_API +# endif -#define _LIBCUDACXX_HAS_NO_VECTOR_EXTENSION +# define _LIBCUDACXX_HAS_NO_VECTOR_EXTENSION -#elif defined(_CCCL_COMPILER_NVRTC) || defined(_CCCL_COMPILER_NVHPC) +# elif defined(_CCCL_COMPILER_NVRTC) || defined(_CCCL_COMPILER_NVHPC) -#if !defined(__cpp_variable_templates) || __cpp_variable_templates < 201304L -#define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES -#endif +# if !defined(__cpp_variable_templates) || __cpp_variable_templates < 201304L +# define _LIBCUDACXX_HAS_NO_VARIABLE_TEMPLATES +# endif -#define _LIBCUDACXX_DISABLE_EXTENSION_WARNING +# define _LIBCUDACXX_DISABLE_EXTENSION_WARNING -#endif // _CCCL_COMPILER_[CLANG|GCC|MSVC|IBM|NVRTC] +# endif // _CCCL_COMPILER_[CLANG|GCC|MSVC|IBM|NVRTC] -#if defined(_CCCL_COMPILER_NVHPC) && !defined(__cuda_std__) +# if defined(_CCCL_COMPILER_NVHPC) && !defined(__cuda_std__) // Forcefully disable visibility controls when used as the standard library with NVC++. // TODO: reevaluate. -#define _LIBCUDACXX_HIDE_FROM_ABI -#ifndef _LIBCUDACXX_DISABLE_EXTERN_TEMPLATE -#define _LIBCUDACXX_DISABLE_EXTERN_TEMPLATE -#endif -#endif - -#ifndef _LIBCUDACXX_FREESTANDING -#if defined(__cuda_std__) \ - || !defined(__STDC_HOSTED__) -# define _LIBCUDACXX_FREESTANDING -#endif -#endif // !_LIBCUDACXX_FREESTANDING +# define _LIBCUDACXX_HIDE_FROM_ABI +# ifndef _LIBCUDACXX_DISABLE_EXTERN_TEMPLATE +# define _LIBCUDACXX_DISABLE_EXTERN_TEMPLATE +# endif +# endif -#ifndef _LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS -#if defined(_CCCL_COMPILER_NVRTC) \ - || (defined(_CCCL_COMPILER_NVHPC) && !defined(__cuda_std__)) -# define _LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS -#endif -#endif // _LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS +# ifndef _LIBCUDACXX_FREESTANDING +# if defined(__cuda_std__) || !defined(__STDC_HOSTED__) +# define _LIBCUDACXX_FREESTANDING +# endif +# endif // !_LIBCUDACXX_FREESTANDING -#ifndef _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT -#if defined(__cuda_std__) -# define _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT -#endif -#endif // _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT +# ifndef _LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS +# if defined(_CCCL_COMPILER_NVRTC) || (defined(_CCCL_COMPILER_NVHPC) && !defined(__cuda_std__)) +# define _LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS +# endif +# endif // _LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS -#ifndef _LIBCUDACXX_HAS_EXTERNAL_ATOMIC_IMP -#if defined(__cuda_std__) -# define _LIBCUDACXX_HAS_EXTERNAL_ATOMIC_IMP -#endif -#endif // _LIBCUDACXX_HAS_EXTERNAL_ATOMIC_IMP +# ifndef _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT +# if defined(__cuda_std__) +# define _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT +# endif +# endif // _LIBCUDACXX_HAS_CUDA_ATOMIC_EXT -#ifndef _LIBCUDACXX_HAS_NO_ASAN -#if defined(_CCCL_COMPILER_GCC) -# if !defined(__SANITIZE_ADDRESS__) -# define 
_LIBCUDACXX_HAS_NO_ASAN -# endif // !__SANITIZE_ADDRESS__ -#elif defined(_CCCL_COMPILER_CLANG) -# if !__has_feature(address_sanitizer) -# define _LIBCUDACXX_HAS_NO_ASAN -# endif // !__has_feature(address_sanitizer) -#else -# define _LIBCUDACXX_HAS_NO_ASAN -#endif // _CCCL_COMPILER[MSVC|IBM|NVHPC|NVRTC] -#endif // _LIBCUDACXX_HAS_NO_ASAN - -#ifndef _LIBCUDACXX_HAS_NO_CXX20_CHRONO_LITERALS -#if defined(__cuda_std__) \ - || (defined(_CCCL_COMPILER_CLANG) && _LIBCUDACXX_CLANG_VER < 800) -# define _LIBCUDACXX_HAS_NO_CXX20_CHRONO_LITERALS -#endif // __cuda_std__ -#endif // _LIBCUDACXX_HAS_NO_CXX20_CHRONO_LITERALS - -#ifndef _LIBCUDACXX_HAS_NO_INT128 -#if defined(_CCCL_COMPILER_MSVC) \ - || (defined(_CCCL_COMPILER_NVRTC) && !defined(__CUDACC_RTC_INT128__)) \ - || (defined(_CCCL_CUDA_COMPILER_NVCC) && (_CCCL_CUDACC_VER < 1105000)) \ - || !defined(__SIZEOF_INT128__) -# define _LIBCUDACXX_HAS_NO_INT128 -#endif -#endif // !_LIBCUDACXX_HAS_NO_INT128 +# ifndef _LIBCUDACXX_HAS_EXTERNAL_ATOMIC_IMP +# if defined(__cuda_std__) +# define _LIBCUDACXX_HAS_EXTERNAL_ATOMIC_IMP +# endif +# endif // _LIBCUDACXX_HAS_EXTERNAL_ATOMIC_IMP + +# ifndef _LIBCUDACXX_HAS_NO_ASAN +# if defined(_CCCL_COMPILER_GCC) +# if !defined(__SANITIZE_ADDRESS__) +# define _LIBCUDACXX_HAS_NO_ASAN +# endif // !__SANITIZE_ADDRESS__ +# elif defined(_CCCL_COMPILER_CLANG) +# if !__has_feature(address_sanitizer) +# define _LIBCUDACXX_HAS_NO_ASAN +# endif // !__has_feature(address_sanitizer) +# else +# define _LIBCUDACXX_HAS_NO_ASAN +# endif // _CCCL_COMPILER[MSVC|IBM|NVHPC|NVRTC] +# endif // _LIBCUDACXX_HAS_NO_ASAN + +# ifndef _LIBCUDACXX_HAS_NO_CXX20_CHRONO_LITERALS +# if defined(__cuda_std__) || (defined(_CCCL_COMPILER_CLANG) && _LIBCUDACXX_CLANG_VER < 800) +# define _LIBCUDACXX_HAS_NO_CXX20_CHRONO_LITERALS +# endif // __cuda_std__ +# endif // _LIBCUDACXX_HAS_NO_CXX20_CHRONO_LITERALS + +# ifndef _LIBCUDACXX_HAS_NO_INT128 +# if defined(_CCCL_COMPILER_MSVC) || (defined(_CCCL_COMPILER_NVRTC) && !defined(__CUDACC_RTC_INT128__)) \ + || (defined(_CCCL_CUDA_COMPILER_NVCC) && (_CCCL_CUDACC_VER < 1105000)) || !defined(__SIZEOF_INT128__) +# define _LIBCUDACXX_HAS_NO_INT128 +# endif +# endif // !_LIBCUDACXX_HAS_NO_INT128 -#ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE -#if defined(_CCCL_CUDACC) -# define _LIBCUDACXX_HAS_NO_LONG_DOUBLE -#endif -#endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE +# ifndef _LIBCUDACXX_HAS_NO_LONG_DOUBLE +# if defined(_CCCL_CUDACC) +# define _LIBCUDACXX_HAS_NO_LONG_DOUBLE +# endif +# endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE # ifndef _LIBCUDACXX_HAS_NVFP16 # if __has_include() \ @@ -1103,43 +1017,42 @@ typedef __char32_t char32_t; # endif # endif // !_LIBCUDACXX_HAS_NVBF16 -#ifndef _LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK -#if defined(__cuda_std__) -# define _LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK -#endif -#endif // _LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK +# ifndef _LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK +# if defined(__cuda_std__) +# define _LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK +# endif +# endif // _LIBCUDACXX_HAS_NO_MONOTONIC_CLOCK -#ifndef _LIBCUDACXX_HAS_NO_PLATFORM_WAIT -#if defined(__cuda_std__) -# define _LIBCUDACXX_HAS_NO_PLATFORM_WAIT -#endif -#endif // _LIBCUDACXX_HAS_NO_PLATFORM_WAIT +# ifndef _LIBCUDACXX_HAS_NO_PLATFORM_WAIT +# if defined(__cuda_std__) +# define _LIBCUDACXX_HAS_NO_PLATFORM_WAIT +# endif +# endif // _LIBCUDACXX_HAS_NO_PLATFORM_WAIT -#ifndef _LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO -#if (defined(_CCCL_COMPILER_MSVC) && _MSC_VER < 1920) \ - || defined(_CCCL_COMPILER_NVRTC) \ - || defined(_CCCL_COMPILER_IBM) -#define 
_LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO -#endif -#endif // _LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO +# ifndef _LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO +# if (defined(_CCCL_COMPILER_MSVC) && _MSC_VER < 1920) || defined(_CCCL_COMPILER_NVRTC) \ + || defined(_CCCL_COMPILER_IBM) +# define _LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO +# endif +# endif // _LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -#if defined(__cuda_std__) -# define _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -#endif -#endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +# if defined(__cuda_std__) +# define _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +# endif +# endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE -#ifndef _LIBCUDACXX_HAS_NO_TREE_BARRIER -#if defined(__cuda_std__) -# define _LIBCUDACXX_HAS_NO_TREE_BARRIER -#endif -#endif // _LIBCUDACXX_HAS_NO_TREE_BARRIER +# ifndef _LIBCUDACXX_HAS_NO_TREE_BARRIER +# if defined(__cuda_std__) +# define _LIBCUDACXX_HAS_NO_TREE_BARRIER +# endif +# endif // _LIBCUDACXX_HAS_NO_TREE_BARRIER -#ifndef _LIBCUDACXX_HAS_NO_WCHAR_H -#if defined(__cuda_std__) -# define _LIBCUDACXX_HAS_NO_WCHAR_H -#endif -#endif // _LIBCUDACXX_HAS_NO_WCHAR_H +# ifndef _LIBCUDACXX_HAS_NO_WCHAR_H +# if defined(__cuda_std__) +# define _LIBCUDACXX_HAS_NO_WCHAR_H +# endif +# endif // _LIBCUDACXX_HAS_NO_WCHAR_H # ifndef _LIBCUDACXX_NO_EXCEPTIONS # if !defined(LIBCUDACXX_ENABLE_EXCEPTIONS) || (defined(_CCCL_COMPILER_MSVC) && _HAS_EXCEPTIONS == 0) \ @@ -1150,405 +1063,508 @@ typedef __char32_t char32_t; // Try to find out if RTTI is disabled. // g++ and cl.exe have RTTI on by default and define a macro when it is. -#ifndef _LIBCUDACXX_NO_RTTI -#if defined(__cuda_std__) \ - || (defined(_CCCL_COMPILER_CLANG) && !(__has_feature(cxx_rtti))) \ - || (defined(_CCCL_COMPILER_GCC) && !defined(__GXX_RTTI)) \ - || (defined(_CCCL_COMPILER_MSVC) && !defined(_CPPRTTI)) -# define _LIBCUDACXX_NO_RTTI -#endif -#endif // !_LIBCUDACXX_NO_RTTI - -#ifndef _LIBCUDACXX_NODEBUG_TYPE -#if defined(__cuda_std__) -# define _LIBCUDACXX_NODEBUG_TYPE -#elif __has_attribute(__nodebug__) \ - && (defined(_CCCL_COMPILER_CLANG) && _LIBCUDACXX_CLANG_VER >= 1210) -# define _LIBCUDACXX_NODEBUG_TYPE __attribute__((nodebug)) -#else -# define _LIBCUDACXX_NODEBUG_TYPE -#endif -#endif // !_LIBCUDACXX_NODEBUG_TYPE +# ifndef _LIBCUDACXX_NO_RTTI +# if defined(__cuda_std__) || (defined(_CCCL_COMPILER_CLANG) && !(__has_feature(cxx_rtti))) \ + || (defined(_CCCL_COMPILER_GCC) && !defined(__GXX_RTTI)) || (defined(_CCCL_COMPILER_MSVC) && !defined(_CPPRTTI)) +# define _LIBCUDACXX_NO_RTTI +# endif +# endif // !_LIBCUDACXX_NO_RTTI -#if defined(_LIBCUDACXX_OBJECT_FORMAT_COFF) +# ifndef _LIBCUDACXX_NODEBUG_TYPE +# if defined(__cuda_std__) +# define _LIBCUDACXX_NODEBUG_TYPE +# elif __has_attribute(__nodebug__) && (defined(_CCCL_COMPILER_CLANG) && _LIBCUDACXX_CLANG_VER >= 1210) +# define _LIBCUDACXX_NODEBUG_TYPE __attribute__((nodebug)) +# else +# define _LIBCUDACXX_NODEBUG_TYPE +# endif +# endif // !_LIBCUDACXX_NODEBUG_TYPE -#ifdef _DLL -# define _LIBCUDACXX_CRT_FUNC __declspec(dllimport) -#else -# define _LIBCUDACXX_CRT_FUNC -#endif +# if defined(_LIBCUDACXX_OBJECT_FORMAT_COFF) -#if defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) -# define _LIBCUDACXX_DLL_VIS -# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS -# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS -# define _LIBCUDACXX_OVERRIDABLE_FUNC_VIS -# define _LIBCUDACXX_EXPORTED_FROM_ABI -#elif 
defined(_LIBCUDACXX_BUILDING_LIBRARY) -# define _LIBCUDACXX_DLL_VIS __declspec(dllexport) -# if defined(__MINGW32__) -# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS _LIBCUDACXX_DLL_VIS -# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS -# else -# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS -# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS _LIBCUDACXX_DLL_VIS -# endif -# define _LIBCUDACXX_OVERRIDABLE_FUNC_VIS _LIBCUDACXX_DLL_VIS -# define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllexport) -#else -# define _LIBCUDACXX_DLL_VIS __declspec(dllimport) -# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS _LIBCUDACXX_DLL_VIS -# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS -# define _LIBCUDACXX_OVERRIDABLE_FUNC_VIS -# define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport) -#endif - -#define _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_DLL_VIS -#define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_DLL_VIS -#define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS -#define _LIBCUDACXX_HIDDEN -#define _LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS -#define _LIBCUDACXX_TEMPLATE_VIS -#define _LIBCUDACXX_ENUM_VIS +# ifdef _DLL +# define _LIBCUDACXX_CRT_FUNC __declspec(dllimport) +# else +# define _LIBCUDACXX_CRT_FUNC +# endif -#endif // defined(_LIBCUDACXX_OBJECT_FORMAT_COFF) +# if defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# define _LIBCUDACXX_DLL_VIS +# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS +# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS +# define _LIBCUDACXX_OVERRIDABLE_FUNC_VIS +# define _LIBCUDACXX_EXPORTED_FROM_ABI +# elif defined(_LIBCUDACXX_BUILDING_LIBRARY) +# define _LIBCUDACXX_DLL_VIS __declspec(dllexport) +# if defined(__MINGW32__) +# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS _LIBCUDACXX_DLL_VIS +# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS +# else +# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS +# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS _LIBCUDACXX_DLL_VIS +# endif +# define _LIBCUDACXX_OVERRIDABLE_FUNC_VIS _LIBCUDACXX_DLL_VIS +# define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllexport) +# else +# define _LIBCUDACXX_DLL_VIS __declspec(dllimport) +# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS _LIBCUDACXX_DLL_VIS +# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS +# define _LIBCUDACXX_OVERRIDABLE_FUNC_VIS +# define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport) +# endif -#ifndef _LIBCUDACXX_HIDDEN -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) -# define _LIBCUDACXX_HIDDEN __attribute__ ((__visibility__("hidden"))) -# else +# define _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_DLL_VIS +# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_DLL_VIS +# define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS # define _LIBCUDACXX_HIDDEN +# define _LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS +# define _LIBCUDACXX_TEMPLATE_VIS +# define _LIBCUDACXX_ENUM_VIS + +# endif // defined(_LIBCUDACXX_OBJECT_FORMAT_COFF) + +# ifndef _LIBCUDACXX_HIDDEN +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# define _LIBCUDACXX_HIDDEN __attribute__((__visibility__("hidden"))) +# else +# define _LIBCUDACXX_HIDDEN +# endif # endif -#endif -#ifndef _LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# ifndef _LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) // The inline should be removed once PR32114 is resolved -# define _LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS inline _LIBCUDACXX_HIDDEN -# else -# define 
_LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS +# define _LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS inline _LIBCUDACXX_HIDDEN +# else +# define _LIBCUDACXX_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS +# endif # endif -#endif -#ifndef _LIBCUDACXX_FUNC_VIS -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) -# define _LIBCUDACXX_FUNC_VIS _CCCL_VISIBILITY_DEFAULT -# else -# define _LIBCUDACXX_FUNC_VIS +# ifndef _LIBCUDACXX_FUNC_VIS +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# define _LIBCUDACXX_FUNC_VIS _CCCL_VISIBILITY_DEFAULT +# else +# define _LIBCUDACXX_FUNC_VIS +# endif # endif -#endif -#ifndef _LIBCUDACXX_TYPE_VIS -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) -# define _LIBCUDACXX_TYPE_VIS _CCCL_VISIBILITY_DEFAULT -# else -# define _LIBCUDACXX_TYPE_VIS +# ifndef _LIBCUDACXX_TYPE_VIS +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# define _LIBCUDACXX_TYPE_VIS _CCCL_VISIBILITY_DEFAULT +# else +# define _LIBCUDACXX_TYPE_VIS +# endif # endif -#endif -#ifndef _LIBCUDACXX_TEMPLATE_VIS -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) -# if __has_attribute(__type_visibility__) -# define _LIBCUDACXX_TEMPLATE_VIS _CCCL_TYPE_VISIBILITY_DEFAULT +# ifndef _LIBCUDACXX_TEMPLATE_VIS +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# if __has_attribute(__type_visibility__) +# define _LIBCUDACXX_TEMPLATE_VIS _CCCL_TYPE_VISIBILITY_DEFAULT +# else +# define _LIBCUDACXX_TEMPLATE_VIS _CCCL_VISIBILITY_DEFAULT +# endif # else -# define _LIBCUDACXX_TEMPLATE_VIS _CCCL_VISIBILITY_DEFAULT +# define _LIBCUDACXX_TEMPLATE_VIS # endif -# else -# define _LIBCUDACXX_TEMPLATE_VIS # endif -#endif -#ifndef _LIBCUDACXX_EXPORTED_FROM_ABI -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) -# define _LIBCUDACXX_EXPORTED_FROM_ABI _CCCL_VISIBILITY_DEFAULT -# else -# define _LIBCUDACXX_EXPORTED_FROM_ABI +# ifndef _LIBCUDACXX_EXPORTED_FROM_ABI +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# define _LIBCUDACXX_EXPORTED_FROM_ABI _CCCL_VISIBILITY_DEFAULT +# else +# define _LIBCUDACXX_EXPORTED_FROM_ABI +# endif # endif -#endif - -#ifndef _LIBCUDACXX_OVERRIDABLE_FUNC_VIS -#define _LIBCUDACXX_OVERRIDABLE_FUNC_VIS _LIBCUDACXX_FUNC_VIS -#endif -#ifndef _LIBCUDACXX_EXCEPTION_ABI -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) -# define _LIBCUDACXX_EXCEPTION_ABI _CCCL_VISIBILITY_DEFAULT -# else -# define _LIBCUDACXX_EXCEPTION_ABI +# ifndef _LIBCUDACXX_OVERRIDABLE_FUNC_VIS +# define _LIBCUDACXX_OVERRIDABLE_FUNC_VIS _LIBCUDACXX_FUNC_VIS # endif -#endif -#ifndef _LIBCUDACXX_ENUM_VIS -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) -# define _LIBCUDACXX_ENUM_VIS _CCCL_TYPE_VISIBILITY_DEFAULT -# else -# define _LIBCUDACXX_ENUM_VIS +# ifndef _LIBCUDACXX_EXCEPTION_ABI +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# define _LIBCUDACXX_EXCEPTION_ABI _CCCL_VISIBILITY_DEFAULT +# else +# define _LIBCUDACXX_EXCEPTION_ABI +# endif # endif -#endif -#ifndef _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS -# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) && __has_attribute(__type_visibility__) -# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS _CCCL_VISIBILITY_DEFAULT -# else -# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS +# ifndef _LIBCUDACXX_ENUM_VIS +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) +# define _LIBCUDACXX_ENUM_VIS _CCCL_TYPE_VISIBILITY_DEFAULT +# else +# define _LIBCUDACXX_ENUM_VIS +# endif # endif -#endif -#ifndef _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS -#define 
_LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS -#endif +# ifndef _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS +# if !defined(_LIBCUDACXX_DISABLE_VISIBILITY_ANNOTATIONS) && __has_attribute(__type_visibility__) +# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS _CCCL_VISIBILITY_DEFAULT +# else +# define _LIBCUDACXX_EXTERN_TEMPLATE_TYPE_VIS +# endif +# endif -#if __has_attribute(internal_linkage) -# define _LIBCUDACXX_INTERNAL_LINKAGE __attribute__ ((internal_linkage)) -#else -# define _LIBCUDACXX_INTERNAL_LINKAGE _LIBCUDACXX_ALWAYS_INLINE -#endif +# ifndef _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS +# define _LIBCUDACXX_CLASS_TEMPLATE_INSTANTIATION_VIS +# endif -#if __has_attribute(exclude_from_explicit_instantiation) -# define _LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION __attribute__ ((__exclude_from_explicit_instantiation__)) -#else - // Try to approximate the effect of exclude_from_explicit_instantiation - // (which is that entities are not assumed to be provided by explicit - // template instantiations in the dylib) by always inlining those entities. -# define _LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION _LIBCUDACXX_ALWAYS_INLINE -#endif +# if __has_attribute(internal_linkage) +# define _LIBCUDACXX_INTERNAL_LINKAGE __attribute__((internal_linkage)) +# else +# define _LIBCUDACXX_INTERNAL_LINKAGE _LIBCUDACXX_ALWAYS_INLINE +# endif -#ifndef _LIBCUDACXX_HIDE_FROM_ABI_PER_TU -# ifndef _LIBCUDACXX_HIDE_FROM_ABI_PER_TU_BY_DEFAULT -# define _LIBCUDACXX_HIDE_FROM_ABI_PER_TU 0 +# if __has_attribute(exclude_from_explicit_instantiation) +# define _LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION __attribute__((__exclude_from_explicit_instantiation__)) # else -# define _LIBCUDACXX_HIDE_FROM_ABI_PER_TU 1 +// Try to approximate the effect of exclude_from_explicit_instantiation +// (which is that entities are not assumed to be provided by explicit +// template instantiations in the dylib) by always inlining those entities. +# define _LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION _LIBCUDACXX_ALWAYS_INLINE # endif -#endif -#ifndef _LIBCUDACXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT -# ifdef _LIBCUDACXX_OBJECT_FORMAT_COFF // Windows binaries can't merge typeinfos. -# define _LIBCUDACXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT 0 -#else +# ifndef _LIBCUDACXX_HIDE_FROM_ABI_PER_TU +# ifndef _LIBCUDACXX_HIDE_FROM_ABI_PER_TU_BY_DEFAULT +# define _LIBCUDACXX_HIDE_FROM_ABI_PER_TU 0 +# else +# define _LIBCUDACXX_HIDE_FROM_ABI_PER_TU 1 +# endif +# endif + +# ifndef _LIBCUDACXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT +# ifdef _LIBCUDACXX_OBJECT_FORMAT_COFF // Windows binaries can't merge typeinfos. +# define _LIBCUDACXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT 0 +# else // TODO: This isn't strictly correct on ELF platforms due to llvm.org/PR37398 // And we should consider defaulting to OFF. 
-# define _LIBCUDACXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT 1 -#endif -#endif +# define _LIBCUDACXX_HAS_MERGED_TYPEINFO_NAMES_DEFAULT 1 +# endif +# endif -#ifndef _LIBCUDACXX_HIDE_FROM_ABI -# if _LIBCUDACXX_HIDE_FROM_ABI_PER_TU -# define _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_HIDDEN _LIBCUDACXX_INTERNAL_LINKAGE -# else -# define _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_HIDDEN _LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION +# ifndef _LIBCUDACXX_HIDE_FROM_ABI +# if _LIBCUDACXX_HIDE_FROM_ABI_PER_TU +# define _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_HIDDEN _LIBCUDACXX_INTERNAL_LINKAGE +# else +# define _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_HIDDEN _LIBCUDACXX_EXCLUDE_FROM_EXPLICIT_INSTANTIATION +# endif # endif -#endif -#ifdef _LIBCUDACXX_BUILDING_LIBRARY -# if _LIBCUDACXX_ABI_VERSION > 1 -# define _LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1 _LIBCUDACXX_HIDE_FROM_ABI +# ifdef _LIBCUDACXX_BUILDING_LIBRARY +# if _LIBCUDACXX_ABI_VERSION > 1 +# define _LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1 _LIBCUDACXX_HIDE_FROM_ABI +# else +# define _LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1 +# endif # else -# define _LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1 +# define _LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1 _LIBCUDACXX_HIDE_FROM_ABI # endif -#else -# define _LIBCUDACXX_HIDE_FROM_ABI_AFTER_V1 _LIBCUDACXX_HIDE_FROM_ABI -#endif // Just so we can migrate to the new macros gradually. -#ifdef __cuda_std__ -# define _LIBCUDACXX_INLINE_VISIBILITY _CCCL_HOST_DEVICE -#else -# define _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_HIDE_FROM_ABI -#endif // __cuda_std__ - -#define _LIBCUDACXX_CONCAT1(_LIBCUDACXX_X,_LIBCUDACXX_Y) _LIBCUDACXX_X##_LIBCUDACXX_Y -#define _LIBCUDACXX_CONCAT(_LIBCUDACXX_X,_LIBCUDACXX_Y) _LIBCUDACXX_CONCAT1(_LIBCUDACXX_X,_LIBCUDACXX_Y) - -#ifndef _LIBCUDACXX_ABI_NAMESPACE -#ifdef __cuda_std__ -# define _LIBCUDACXX_ABI_NAMESPACE _LIBCUDACXX_CONCAT(__,_LIBCUDACXX_CUDA_ABI_VERSION) -#else -# define _LIBCUDACXX_ABI_NAMESPACE _LIBCUDACXX_CONCAT(__,_LIBCUDACXX_ABI_VERSION) -#endif // __cuda_std__ -#endif // _LIBCUDACXX_ABI_NAMESPACE - -#ifdef __cuda_std__ -# define _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION namespace cuda { namespace std { -# define _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION } } -# define _CUDA_VSTD_NOVERSION ::cuda::std -# define _CUDA_VSTD ::cuda::std::_LIBCUDACXX_ABI_NAMESPACE -# define _CUDA_VRANGES ::cuda::std::ranges::_LIBCUDACXX_ABI_NAMESPACE -# define _CUDA_VIEWS ::cuda::std::ranges::views::_LIBCUDACXX_CUDA_ABI_NAMESPACE -# define _CUDA_VMR ::cuda::mr::_LIBCUDACXX_ABI_NAMESPACE -# define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE -#else -# define _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION namespace std { -# define _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION } -# define _CUDA_VSTD_NOVERSION ::std -# define _CUDA_VSTD ::std::_LIBCUDACXX_ABI_NAMESPACE -# define _CUDA_VRANGES ::std::ranges::_LIBCUDACXX_ABI_NAMESPACE -# define _CUDA_VIEWS ::std::ranges::views::_LIBCUDACXX_CUDA_ABI_NAMESPACE -#endif - -#ifdef __cuda_std__ -#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA namespace cuda { inline namespace _LIBCUDACXX_ABI_NAMESPACE { -#define _LIBCUDACXX_END_NAMESPACE_CUDA } } -#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR namespace cuda { namespace mr { inline namespace _LIBCUDACXX_ABI_NAMESPACE { -#define _LIBCUDACXX_END_NAMESPACE_CUDA_MR } } } -#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE { -#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } } -#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE { 
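As a usage sketch (not part of the patch), this is how the namespace and visibility helpers being reformatted here are meant to combine inside the library's own headers when __cuda_std__ is defined; __example_identity is a hypothetical name used only for illustration.

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA // expands to: namespace cuda { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
template <class _Tp>
_LIBCUDACXX_INLINE_VISIBILITY constexpr _Tp __example_identity(_Tp __t) noexcept
{
  return __t; // under __cuda_std__, _LIBCUDACXX_INLINE_VISIBILITY marks this host/device via _CCCL_HOST_DEVICE
}
_LIBCUDACXX_END_NAMESPACE_CUDA // closes both namespaces

Because the ABI namespace is an inline namespace, callers spell this simply as cuda::__example_identity(42), while the mangled name stays tied to the ABI version macro.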
-#define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } } -#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE { -#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } } -#endif +# ifdef __cuda_std__ +# define _LIBCUDACXX_INLINE_VISIBILITY _CCCL_HOST_DEVICE +# else +# define _LIBCUDACXX_INLINE_VISIBILITY _LIBCUDACXX_HIDE_FROM_ABI +# endif // __cuda_std__ -// Inline namespaces are available in Clang/GCC/MSVC regardless of C++ dialect. -#define _LIBCUDACXX_BEGIN_NAMESPACE_STD _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION inline namespace _LIBCUDACXX_ABI_NAMESPACE { -#define _LIBCUDACXX_END_NAMESPACE_STD } _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION +# define _LIBCUDACXX_CONCAT1(_LIBCUDACXX_X, _LIBCUDACXX_Y) _LIBCUDACXX_X##_LIBCUDACXX_Y +# define _LIBCUDACXX_CONCAT(_LIBCUDACXX_X, _LIBCUDACXX_Y) _LIBCUDACXX_CONCAT1(_LIBCUDACXX_X, _LIBCUDACXX_Y) -#ifndef __cuda_std__ -_LIBCUDACXX_BEGIN_NAMESPACE_STD _LIBCUDACXX_END_NAMESPACE_STD -#endif +# ifndef _LIBCUDACXX_ABI_NAMESPACE +# ifdef __cuda_std__ +# define _LIBCUDACXX_ABI_NAMESPACE _LIBCUDACXX_CONCAT(__, _LIBCUDACXX_CUDA_ABI_VERSION) +# else +# define _LIBCUDACXX_ABI_NAMESPACE _LIBCUDACXX_CONCAT(__, _LIBCUDACXX_ABI_VERSION) +# endif // __cuda_std__ +# endif // _LIBCUDACXX_ABI_NAMESPACE -#define _LIBCUDACXX_BEGIN_NAMESPACE_RANGES _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION namespace ranges { inline namespace _LIBCUDACXX_ABI_NAMESPACE { -#define _LIBCUDACXX_END_NAMESPACE_RANGES } } _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION +# ifdef __cuda_std__ +# define _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION \ + namespace cuda \ + { \ + namespace std \ + { +# define _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION \ + } \ + } +# define _CUDA_VSTD_NOVERSION ::cuda::std +# define _CUDA_VSTD ::cuda::std::_LIBCUDACXX_ABI_NAMESPACE +# define _CUDA_VRANGES ::cuda::std::ranges::_LIBCUDACXX_ABI_NAMESPACE +# define _CUDA_VIEWS ::cuda::std::ranges::views::_LIBCUDACXX_CUDA_ABI_NAMESPACE +# define _CUDA_VMR ::cuda::mr::_LIBCUDACXX_ABI_NAMESPACE +# define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE +# else +# define _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION \ + namespace std \ + { +# define _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION } +# define _CUDA_VSTD_NOVERSION ::std +# define _CUDA_VSTD ::std::_LIBCUDACXX_ABI_NAMESPACE +# define _CUDA_VRANGES ::std::ranges::_LIBCUDACXX_ABI_NAMESPACE +# define _CUDA_VIEWS ::std::ranges::views::_LIBCUDACXX_CUDA_ABI_NAMESPACE +# endif -#if !defined(__cuda_std__) -_LIBCUDACXX_BEGIN_NAMESPACE_RANGES _LIBCUDACXX_END_NAMESPACE_RANGES -#endif +# ifdef __cuda_std__ +# define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA \ + namespace cuda \ + { \ + inline namespace _LIBCUDACXX_ABI_NAMESPACE \ + { +# define _LIBCUDACXX_END_NAMESPACE_CUDA \ + } \ + } +# define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_MR \ + namespace cuda \ + { \ + namespace mr \ + { \ + inline namespace _LIBCUDACXX_ABI_NAMESPACE \ + { +# define _LIBCUDACXX_END_NAMESPACE_CUDA_MR \ + } \ + } \ + } +# define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE \ + namespace cuda \ + { \ + namespace device \ + { \ + inline namespace _LIBCUDACXX_ABI_NAMESPACE \ + { +# define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE \ + } \ + } \ + } +# define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX \ + namespace cuda \ + { \ + namespace ptx \ + { \ + inline namespace _LIBCUDACXX_ABI_NAMESPACE \ + { +# define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX \ + } \ + } \ + } +# define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL \ + namespace cuda \ + 
{ \ + namespace device \ + { \ + namespace experimental \ + { \ + inline namespace _LIBCUDACXX_ABI_NAMESPACE \ + { +# define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL \ + } \ + } \ + } \ + } +# endif -#define _LIBCUDACXX_BEGIN_NAMESPACE_VIEWS _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION namespace ranges { namespace views { inline namespace _LIBCUDACXX_CUDA_ABI_NAMESPACE { -#define _LIBCUDACXX_END_NAMESPACE_VIEWS } } } _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION -#if !defined(__cuda_std__) -_LIBCUDACXX_BEGIN_NAMESPACE_VIEWS _LIBCUDACXX_END_NAMESPACE_VIEWS -#endif +// Inline namespaces are available in Clang/GCC/MSVC regardless of C++ dialect. +# define _LIBCUDACXX_BEGIN_NAMESPACE_STD \ + _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION \ + inline namespace _LIBCUDACXX_ABI_NAMESPACE \ + { +# define _LIBCUDACXX_END_NAMESPACE_STD \ + } \ + _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION + +# ifndef __cuda_std__ +_LIBCUDACXX_BEGIN_NAMESPACE_STD +_LIBCUDACXX_END_NAMESPACE_STD +# endif -#if _CCCL_STD_VER > 2017 -#define _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI inline namespace __cxx20 { -#else -#define _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI inline namespace __cxx17 { -#endif -#define _LIBCUDACXX_END_NAMESPACE_RANGES_ABI } +# define _LIBCUDACXX_BEGIN_NAMESPACE_RANGES \ + _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION \ + namespace ranges \ + { \ + inline namespace _LIBCUDACXX_ABI_NAMESPACE \ + { +# define _LIBCUDACXX_END_NAMESPACE_RANGES \ + } \ + } \ + _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION + +# if !defined(__cuda_std__) +_LIBCUDACXX_BEGIN_NAMESPACE_RANGES +_LIBCUDACXX_END_NAMESPACE_RANGES +# endif -#define _LIBCUDACXX_BEGIN_NAMESPACE_CPO(_CPO) namespace _CPO { _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI -#define _LIBCUDACXX_END_NAMESPACE_CPO } } +# define _LIBCUDACXX_BEGIN_NAMESPACE_VIEWS \ + _LIBCUDACXX_BEGIN_NAMESPACE_STD_NOVERSION \ + namespace ranges \ + { \ + namespace views \ + { \ + inline namespace _LIBCUDACXX_CUDA_ABI_NAMESPACE \ + { +# define _LIBCUDACXX_END_NAMESPACE_VIEWS \ + } \ + } \ + } \ + _LIBCUDACXX_END_NAMESPACE_STD_NOVERSION +# if !defined(__cuda_std__) +_LIBCUDACXX_BEGIN_NAMESPACE_VIEWS +_LIBCUDACXX_END_NAMESPACE_VIEWS +# endif -#if _CCCL_STD_VER >= 2017 -#define _LIBCUDACXX_BEGIN_NAMESPACE_FILESYSTEM \ - _LIBCUDACXX_BEGIN_NAMESPACE_STD inline namespace __fs { namespace filesystem { -#else -#define _LIBCUDACXX_BEGIN_NAMESPACE_FILESYSTEM \ - _LIBCUDACXX_BEGIN_NAMESPACE_STD namespace __fs { namespace filesystem { -#endif +# if _CCCL_STD_VER > 2017 +# define _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI \ + inline namespace __cxx20 \ + { +# else +# define _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI \ + inline namespace __cxx17 \ + { +# endif +# define _LIBCUDACXX_END_NAMESPACE_RANGES_ABI } + +# define _LIBCUDACXX_BEGIN_NAMESPACE_CPO(_CPO) \ + namespace _CPO \ + { \ + _LIBCUDACXX_BEGIN_NAMESPACE_RANGES_ABI +# define _LIBCUDACXX_END_NAMESPACE_CPO \ + } \ + } + +# if _CCCL_STD_VER >= 2017 +# define _LIBCUDACXX_BEGIN_NAMESPACE_FILESYSTEM \ + _LIBCUDACXX_BEGIN_NAMESPACE_STD \ + inline namespace __fs \ + { \ + namespace filesystem \ + { +# else +# define _LIBCUDACXX_BEGIN_NAMESPACE_FILESYSTEM \ + _LIBCUDACXX_BEGIN_NAMESPACE_STD \ + namespace __fs \ + { \ + namespace filesystem \ + { +# endif -#define _LIBCUDACXX_END_NAMESPACE_FILESYSTEM \ - _LIBCUDACXX_END_NAMESPACE_STD } } +# define _LIBCUDACXX_END_NAMESPACE_FILESYSTEM \ + _LIBCUDACXX_END_NAMESPACE_STD \ + } \ + } -#define _CUDA_VSTD_FS _CUDA_VSTD::__fs::filesystem +# define _CUDA_VSTD_FS _CUDA_VSTD::__fs::filesystem -#ifndef _LIBCUDACXX_PREFERRED_OVERLOAD -# if 
__has_attribute(__enable_if__) -# define _LIBCUDACXX_PREFERRED_OVERLOAD __attribute__ ((__enable_if__(true, ""))) +# ifndef _LIBCUDACXX_PREFERRED_OVERLOAD +# if __has_attribute(__enable_if__) +# define _LIBCUDACXX_PREFERRED_OVERLOAD __attribute__((__enable_if__(true, ""))) +# endif # endif -#endif -#ifdef _LIBCUDACXX_HAS_NO_UNICODE_CHARS +# ifdef _LIBCUDACXX_HAS_NO_UNICODE_CHARS typedef unsigned short char16_t; -typedef unsigned int char32_t; -#endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS - -#if defined(_CCCL_COMPILER_GCC) \ - || defined(_CCCL_COMPILER_CLANG) -# define _LIBCUDACXX_NOALIAS __attribute__((__malloc__)) -#else -# define _LIBCUDACXX_NOALIAS -#endif +typedef unsigned int char32_t; +# endif // _LIBCUDACXX_HAS_NO_UNICODE_CHARS -#if __has_feature(cxx_explicit_conversions) \ - || defined(_CCCL_COMPILER_IBM) \ - || defined(_CCCL_COMPILER_GCC) \ - || defined(_CCCL_COMPILER_CLANG) -# define _LIBCUDACXX_EXPLICIT explicit -#else -# define _LIBCUDACXX_EXPLICIT -#endif - -#if !__has_builtin(__builtin_operator_new) || !__has_builtin(__builtin_operator_delete) -#define _LIBCUDACXX_HAS_NO_BUILTIN_OPERATOR_NEW_DELETE -#endif +# if defined(_CCCL_COMPILER_GCC) || defined(_CCCL_COMPILER_CLANG) +# define _LIBCUDACXX_NOALIAS __attribute__((__malloc__)) +# else +# define _LIBCUDACXX_NOALIAS +# endif -#ifdef _LIBCUDACXX_HAS_NO_STRONG_ENUMS -# define _LIBCUDACXX_DECLARE_STRONG_ENUM(x) struct _LIBCUDACXX_TYPE_VIS x { enum __lx -# define _LIBCUDACXX_DECLARE_STRONG_ENUM_EPILOG(x) \ - __lx __v_; \ - _LIBCUDACXX_INLINE_VISIBILITY x(__lx __v) : __v_(__v) {} \ - _LIBCUDACXX_INLINE_VISIBILITY explicit x(int __v) : __v_(static_cast<__lx>(__v)) {} \ - _LIBCUDACXX_INLINE_VISIBILITY operator int() const {return __v_;} \ - }; -#else // _LIBCUDACXX_HAS_NO_STRONG_ENUMS -# define _LIBCUDACXX_DECLARE_STRONG_ENUM(x) enum class _LIBCUDACXX_ENUM_VIS x -# define _LIBCUDACXX_DECLARE_STRONG_ENUM_EPILOG(x) -#endif // _LIBCUDACXX_HAS_NO_STRONG_ENUMS - -#ifdef _LIBCUDACXX_DEBUG -# if _LIBCUDACXX_DEBUG == 0 -# define _LIBCUDACXX_DEBUG_LEVEL 1 -# elif _LIBCUDACXX_DEBUG == 1 -# define _LIBCUDACXX_DEBUG_LEVEL 2 +# if __has_feature(cxx_explicit_conversions) || defined(_CCCL_COMPILER_IBM) || defined(_CCCL_COMPILER_GCC) \ + || defined(_CCCL_COMPILER_CLANG) +# define _LIBCUDACXX_EXPLICIT explicit # else -# error Supported values for _LIBCUDACXX_DEBUG are 0 and 1 +# define _LIBCUDACXX_EXPLICIT # endif -# if !defined(_LIBCUDACXX_BUILDING_LIBRARY) -# define _LIBCUDACXX_EXTERN_TEMPLATE(...) + +# if !__has_builtin(__builtin_operator_new) || !__has_builtin(__builtin_operator_delete) +# define _LIBCUDACXX_HAS_NO_BUILTIN_OPERATOR_NEW_DELETE # endif -#endif -#ifdef _LIBCUDACXX_DISABLE_EXTERN_TEMPLATE -#define _LIBCUDACXX_EXTERN_TEMPLATE(...) -#define _LIBCUDACXX_EXTERN_TEMPLATE2(...) 
-#endif +# ifdef _LIBCUDACXX_HAS_NO_STRONG_ENUMS +# define _LIBCUDACXX_DECLARE_STRONG_ENUM(x) \ + struct _LIBCUDACXX_TYPE_VIS x \ + { \ + enum __lx +# define _LIBCUDACXX_DECLARE_STRONG_ENUM_EPILOG(x) \ + __lx __v_; \ + _LIBCUDACXX_INLINE_VISIBILITY x(__lx __v) \ + : __v_(__v) \ + {} \ + _LIBCUDACXX_INLINE_VISIBILITY explicit x(int __v) \ + : __v_(static_cast<__lx>(__v)) \ + {} \ + _LIBCUDACXX_INLINE_VISIBILITY operator int() const \ + { \ + return __v_; \ + } \ + } \ + ; +# else // _LIBCUDACXX_HAS_NO_STRONG_ENUMS +# define _LIBCUDACXX_DECLARE_STRONG_ENUM(x) enum class _LIBCUDACXX_ENUM_VIS x +# define _LIBCUDACXX_DECLARE_STRONG_ENUM_EPILOG(x) +# endif // _LIBCUDACXX_HAS_NO_STRONG_ENUMS + +# ifdef _LIBCUDACXX_DEBUG +# if _LIBCUDACXX_DEBUG == 0 +# define _LIBCUDACXX_DEBUG_LEVEL 1 +# elif _LIBCUDACXX_DEBUG == 1 +# define _LIBCUDACXX_DEBUG_LEVEL 2 +# else +# error Supported values for _LIBCUDACXX_DEBUG are 0 and 1 +# endif +# if !defined(_LIBCUDACXX_BUILDING_LIBRARY) +# define _LIBCUDACXX_EXTERN_TEMPLATE(...) +# endif +# endif -#ifndef _LIBCUDACXX_EXTERN_TEMPLATE -#define _LIBCUDACXX_EXTERN_TEMPLATE(...) extern template __VA_ARGS__; -#endif +# ifdef _LIBCUDACXX_DISABLE_EXTERN_TEMPLATE +# define _LIBCUDACXX_EXTERN_TEMPLATE(...) +# define _LIBCUDACXX_EXTERN_TEMPLATE2(...) +# endif -#ifndef _LIBCUDACXX_EXTERN_TEMPLATE2 -#define _LIBCUDACXX_EXTERN_TEMPLATE2(...) extern template __VA_ARGS__; -#endif +# ifndef _LIBCUDACXX_EXTERN_TEMPLATE +# define _LIBCUDACXX_EXTERN_TEMPLATE(...) extern template __VA_ARGS__; +# endif -#if defined(__APPLE__) || defined(__FreeBSD__) || defined(_LIBCUDACXX_MSVCRT_LIKE) || \ - defined(__sun__) || defined(__NetBSD__) || defined(__CloudABI__) -#define _LIBCUDACXX_LOCALE__L_EXTENSIONS 1 -#endif +# ifndef _LIBCUDACXX_EXTERN_TEMPLATE2 +# define _LIBCUDACXX_EXTERN_TEMPLATE2(...) extern template __VA_ARGS__; +# endif + +# if defined(__APPLE__) || defined(__FreeBSD__) || defined(_LIBCUDACXX_MSVCRT_LIKE) || defined(__sun__) \ + || defined(__NetBSD__) || defined(__CloudABI__) +# define _LIBCUDACXX_LOCALE__L_EXTENSIONS 1 +# endif -#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) +# if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) // Most unix variants have catopen. These are the specific ones that don't. -# if !defined(__BIONIC__) && !defined(_NEWLIB_VERSION) -# define _LIBCUDACXX_HAS_CATOPEN 1 +# if !defined(__BIONIC__) && !defined(_NEWLIB_VERSION) +# define _LIBCUDACXX_HAS_CATOPEN 1 +# endif # endif -#endif -#ifdef __FreeBSD__ -#define _DECLARE_C99_LDBL_MATH 1 -#endif +# ifdef __FreeBSD__ +# define _DECLARE_C99_LDBL_MATH 1 +# endif -#if defined(_LIBCUDACXX_ABI_MICROSOFT) && !defined(_LIBCUDACXX_NO_VCRUNTIME) -# define _LIBCUDACXX_DEFER_NEW_TO_VCRUNTIME -#endif +# if defined(_LIBCUDACXX_ABI_MICROSOFT) && !defined(_LIBCUDACXX_NO_VCRUNTIME) +# define _LIBCUDACXX_DEFER_NEW_TO_VCRUNTIME +# endif // If we are getting operator new from the MSVC CRT, then allocation overloads // for align_val_t were added in 19.12, aka VS 2017 version 15.3. -#if defined(_LIBCUDACXX_MSVCRT) && defined(_CCCL_COMPILER_MSVC) && _MSC_VER < 1912 -# define _LIBCUDACXX_HAS_NO_LIBRARY_ALIGNED_ALLOCATION -#elif defined(_LIBCUDACXX_ABI_VCRUNTIME) && !defined(__cpp_aligned_new) - // We're deferring to Microsoft's STL to provide aligned new et al. We don't - // have it unless the language feature test macro is defined. 
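A usage sketch for the _LIBCUDACXX_DECLARE_STRONG_ENUM / _LIBCUDACXX_DECLARE_STRONG_ENUM_EPILOG pair reformatted above; the enum and its enumerators are made up for illustration and are not part of the patch.

_LIBCUDACXX_DECLARE_STRONG_ENUM(__example_mode)
{
  __read,
  __write
};
_LIBCUDACXX_DECLARE_STRONG_ENUM_EPILOG(__example_mode)
// With scoped enums available, this is just: enum class _LIBCUDACXX_ENUM_VIS __example_mode { __read, __write };
// and the epilog expands to nothing. On the fallback path, __example_mode becomes a struct that wraps an
// unscoped `enum __lx`, stores an __lx member, and the epilog supplies the constructors and operator int().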
-# define _LIBCUDACXX_HAS_NO_LIBRARY_ALIGNED_ALLOCATION -#endif - -#if defined(__APPLE__) -# if !defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \ - defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) -# define __MAC_OS_X_VERSION_MIN_REQUIRED __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ +# if defined(_LIBCUDACXX_MSVCRT) && defined(_CCCL_COMPILER_MSVC) && _MSC_VER < 1912 +# define _LIBCUDACXX_HAS_NO_LIBRARY_ALIGNED_ALLOCATION +# elif defined(_LIBCUDACXX_ABI_VCRUNTIME) && !defined(__cpp_aligned_new) +// We're deferring to Microsoft's STL to provide aligned new et al. We don't +// have it unless the language feature test macro is defined. +# define _LIBCUDACXX_HAS_NO_LIBRARY_ALIGNED_ALLOCATION # endif -#endif // defined(__APPLE__) + +# if defined(__APPLE__) +# if !defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) +# define __MAC_OS_X_VERSION_MIN_REQUIRED __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ +# endif +# endif // defined(__APPLE__) # if !defined(_LIBCUDACXX_HAS_NO_ALIGNED_ALLOCATION) \ && (defined(_LIBCUDACXX_HAS_NO_LIBRARY_ALIGNED_ALLOCATION) \ @@ -1574,151 +1590,138 @@ typedef unsigned int char32_t; // Deprecations warnings are always enabled, except when users explicitly opt-out // by defining _LIBCUDACXX_DISABLE_DEPRECATION_WARNINGS. // NVCC 11.1 and 11.2 are broken with the deprecated attribute, so disable it -#if !defined(_LIBCUDACXX_DISABLE_DEPRECATION_WARNINGS) \ - && !defined(_CCCL_CUDACC_BELOW_11_3) -# if __has_attribute(deprecated) -# define _LIBCUDACXX_DEPRECATED __attribute__ ((deprecated)) -# elif _CCCL_STD_VER > 2011 -# define _LIBCUDACXX_DEPRECATED [[deprecated]] +# if !defined(_LIBCUDACXX_DISABLE_DEPRECATION_WARNINGS) && !defined(_CCCL_CUDACC_BELOW_11_3) +# if __has_attribute(deprecated) +# define _LIBCUDACXX_DEPRECATED __attribute__((deprecated)) +# elif _CCCL_STD_VER > 2011 +# define _LIBCUDACXX_DEPRECATED [[deprecated]] +# else +# define _LIBCUDACXX_DEPRECATED +# endif # else # define _LIBCUDACXX_DEPRECATED # endif -#else -# define _LIBCUDACXX_DEPRECATED -#endif -#define _LIBCUDACXX_DEPRECATED_IN_CXX11 _LIBCUDACXX_DEPRECATED +# define _LIBCUDACXX_DEPRECATED_IN_CXX11 _LIBCUDACXX_DEPRECATED -#if _CCCL_STD_VER >= 2014 -# define _LIBCUDACXX_DEPRECATED_IN_CXX14 _LIBCUDACXX_DEPRECATED -#else -# define _LIBCUDACXX_DEPRECATED_IN_CXX14 -#endif +# if _CCCL_STD_VER >= 2014 +# define _LIBCUDACXX_DEPRECATED_IN_CXX14 _LIBCUDACXX_DEPRECATED +# else +# define _LIBCUDACXX_DEPRECATED_IN_CXX14 +# endif -#if _CCCL_STD_VER >= 2017 -# define _LIBCUDACXX_DEPRECATED_IN_CXX17 _LIBCUDACXX_DEPRECATED -#else -# define _LIBCUDACXX_DEPRECATED_IN_CXX17 -#endif +# if _CCCL_STD_VER >= 2017 +# define _LIBCUDACXX_DEPRECATED_IN_CXX17 _LIBCUDACXX_DEPRECATED +# else +# define _LIBCUDACXX_DEPRECATED_IN_CXX17 +# endif -#if _CCCL_STD_VER >= 2020 -# define _LIBCUDACXX_DEPRECATED_IN_CXX20 _LIBCUDACXX_DEPRECATED -#else -# define _LIBCUDACXX_DEPRECATED_IN_CXX20 -#endif +# if _CCCL_STD_VER >= 2020 +# define _LIBCUDACXX_DEPRECATED_IN_CXX20 _LIBCUDACXX_DEPRECATED +# else +# define _LIBCUDACXX_DEPRECATED_IN_CXX20 +# endif -#if _CCCL_STD_VER <= 2011 -# define _LIBCUDACXX_EXPLICIT_AFTER_CXX11 -#else -# define _LIBCUDACXX_EXPLICIT_AFTER_CXX11 explicit -#endif +# if _CCCL_STD_VER <= 2011 +# define _LIBCUDACXX_EXPLICIT_AFTER_CXX11 +# else +# define _LIBCUDACXX_EXPLICIT_AFTER_CXX11 explicit +# endif -#if _CCCL_STD_VER > 2014 && defined(__cpp_inline_variables) && (__cpp_inline_variables >= 201606L) -# define _LIBCUDACXX_INLINE_VAR inline -#else -# define 
_LIBCUDACXX_INLINE_VAR -#endif +# if _CCCL_STD_VER > 2014 && defined(__cpp_inline_variables) && (__cpp_inline_variables >= 201606L) +# define _LIBCUDACXX_INLINE_VAR inline +# else +# define _LIBCUDACXX_INLINE_VAR +# endif -#ifdef _LIBCUDACXX_HAS_NO_RVALUE_REFERENCES -# define _LIBCUDACXX_EXPLICIT_MOVE(x) _CUDA_VSTD::move(x) -#else -# define _LIBCUDACXX_EXPLICIT_MOVE(x) (x) -#endif +# ifdef _LIBCUDACXX_HAS_NO_RVALUE_REFERENCES +# define _LIBCUDACXX_EXPLICIT_MOVE(x) _CUDA_VSTD::move(x) +# else +# define _LIBCUDACXX_EXPLICIT_MOVE(x) (x) +# endif -#if __has_attribute(no_destroy) -# define _LIBCUDACXX_NO_DESTROY __attribute__((__no_destroy__)) -#else -# define _LIBCUDACXX_NO_DESTROY -#endif +# if __has_attribute(no_destroy) +# define _LIBCUDACXX_NO_DESTROY __attribute__((__no_destroy__)) +# else +# define _LIBCUDACXX_NO_DESTROY +# endif -#ifndef _LIBCUDACXX_HAS_NO_ASAN -extern "C" _LIBCUDACXX_FUNC_VIS void __sanitizer_annotate_contiguous_container( - const void *, const void *, const void *, const void *); -#endif +# ifndef _LIBCUDACXX_HAS_NO_ASAN +extern "C" _LIBCUDACXX_FUNC_VIS void +__sanitizer_annotate_contiguous_container(const void*, const void*, const void*, const void*); +# endif -#ifndef _LIBCUDACXX_WEAK -#define _LIBCUDACXX_WEAK __attribute__((__weak__)) -#endif +# ifndef _LIBCUDACXX_WEAK +# define _LIBCUDACXX_WEAK __attribute__((__weak__)) +# endif // Redefine some macros for internal use -#if defined(__cuda_std__) -# undef _LIBCUDACXX_FUNC_VIS -# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY -# undef _LIBCUDACXX_TYPE_VIS -# define _LIBCUDACXX_TYPE_VIS -#endif // __cuda_std__ +# if defined(__cuda_std__) +# undef _LIBCUDACXX_FUNC_VIS +# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY +# undef _LIBCUDACXX_TYPE_VIS +# define _LIBCUDACXX_TYPE_VIS +# endif // __cuda_std__ // Thread API -#ifndef _LIBCUDACXX_HAS_THREAD_API_EXTERNAL -#if defined(_CCCL_COMPILER_NVRTC) \ - || defined(__EMSCRIPTEN__) -# define _LIBCUDACXX_HAS_THREAD_API_EXTERNAL -#endif -#endif // _LIBCUDACXX_HAS_THREAD_API_EXTERNAL - -#ifndef _LIBCUDACXX_HAS_THREAD_API_CUDA -#if defined(__cuda_std__) \ - && (defined(__CUDA_ARCH__) || defined(__EMSCRIPTEN__)) -# define _LIBCUDACXX_HAS_THREAD_API_CUDA -#endif // __cuda_std__ -#endif // _LIBCUDACXX_HAS_THREAD_API_CUDA - -#ifndef _LIBCUDACXX_HAS_THREAD_API_WIN32 -#if defined(_CCCL_COMPILER_MSVC) \ - && !defined(_LIBCUDACXX_HAS_THREAD_API_CUDA) -# define _LIBCUDACXX_HAS_THREAD_API_WIN32 -#endif -#endif // _LIBCUDACXX_HAS_THREAD_API_WIN32 - -#if !defined(_LIBCUDACXX_HAS_NO_THREADS) \ - && !defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) \ - && !defined(_LIBCUDACXX_HAS_THREAD_API_WIN32) \ - && !defined(_LIBCUDACXX_HAS_THREAD_API_EXTERNAL) -# if defined(__FreeBSD__) || \ - defined(__Fuchsia__) || \ - defined(__wasi__) || \ - defined(__NetBSD__) || \ - defined(__linux__) || \ - defined(__GNU__) || \ - defined(__APPLE__) || \ - defined(__CloudABI__) || \ - defined(__sun__) || \ - (defined(__MINGW32__) && __has_include()) -# define _LIBCUDACXX_HAS_THREAD_API_PTHREAD -# elif defined(_LIBCUDACXX_WIN32API) -# define _LIBCUDACXX_HAS_THREAD_API_WIN32 -# else -# define _LIBCUDACXX_UNSUPPORTED_THREAD_API -# endif // _LIBCUDACXX_HAS_THREAD_API -#endif // _LIBCUDACXX_HAS_NO_THREADS - -#if defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) -#if defined(__ANDROID__) && __ANDROID_API__ >= 30 -#define _LIBCUDACXX_HAS_COND_CLOCKWAIT -#elif defined(_LIBCUDACXX_GLIBC_PREREQ) -#if _LIBCUDACXX_GLIBC_PREREQ(2, 30) -#define _LIBCUDACXX_HAS_COND_CLOCKWAIT -#endif -#endif -#endif +# ifndef 
_LIBCUDACXX_HAS_THREAD_API_EXTERNAL +# if defined(_CCCL_COMPILER_NVRTC) || defined(__EMSCRIPTEN__) +# define _LIBCUDACXX_HAS_THREAD_API_EXTERNAL +# endif +# endif // _LIBCUDACXX_HAS_THREAD_API_EXTERNAL + +# ifndef _LIBCUDACXX_HAS_THREAD_API_CUDA +# if defined(__cuda_std__) && (defined(__CUDA_ARCH__) || defined(__EMSCRIPTEN__)) +# define _LIBCUDACXX_HAS_THREAD_API_CUDA +# endif // __cuda_std__ +# endif // _LIBCUDACXX_HAS_THREAD_API_CUDA -#if defined(_LIBCUDACXX_HAS_NO_THREADS) && defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) -#error _LIBCUDACXX_HAS_THREAD_API_PTHREAD may only be defined when \ +# ifndef _LIBCUDACXX_HAS_THREAD_API_WIN32 +# if defined(_CCCL_COMPILER_MSVC) && !defined(_LIBCUDACXX_HAS_THREAD_API_CUDA) +# define _LIBCUDACXX_HAS_THREAD_API_WIN32 +# endif +# endif // _LIBCUDACXX_HAS_THREAD_API_WIN32 + +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) \ + && !defined(_LIBCUDACXX_HAS_THREAD_API_WIN32) && !defined(_LIBCUDACXX_HAS_THREAD_API_EXTERNAL) +# if defined(__FreeBSD__) || defined(__Fuchsia__) || defined(__wasi__) || defined(__NetBSD__) || defined(__linux__) \ + || defined(__GNU__) || defined(__APPLE__) || defined(__CloudABI__) || defined(__sun__) \ + || (defined(__MINGW32__) && __has_include()) +# define _LIBCUDACXX_HAS_THREAD_API_PTHREAD +# elif defined(_LIBCUDACXX_WIN32API) +# define _LIBCUDACXX_HAS_THREAD_API_WIN32 +# else +# define _LIBCUDACXX_UNSUPPORTED_THREAD_API +# endif // _LIBCUDACXX_HAS_THREAD_API +# endif // _LIBCUDACXX_HAS_NO_THREADS + +# if defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) +# if defined(__ANDROID__) && __ANDROID_API__ >= 30 +# define _LIBCUDACXX_HAS_COND_CLOCKWAIT +# elif defined(_LIBCUDACXX_GLIBC_PREREQ) +# if _LIBCUDACXX_GLIBC_PREREQ(2, 30) +# define _LIBCUDACXX_HAS_COND_CLOCKWAIT +# endif +# endif +# endif + +# if defined(_LIBCUDACXX_HAS_NO_THREADS) && defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) +# error _LIBCUDACXX_HAS_THREAD_API_PTHREAD may only be defined when \ _LIBCUDACXX_HAS_NO_THREADS is not defined. -#endif +# endif -#if defined(_LIBCUDACXX_HAS_NO_THREADS) && defined(_LIBCUDACXX_HAS_THREAD_API_EXTERNAL) -#error _LIBCUDACXX_HAS_THREAD_API_EXTERNAL may not be defined when \ +# if defined(_LIBCUDACXX_HAS_NO_THREADS) && defined(_LIBCUDACXX_HAS_THREAD_API_EXTERNAL) +# error _LIBCUDACXX_HAS_THREAD_API_EXTERNAL may not be defined when \ _LIBCUDACXX_HAS_NO_THREADS is defined. -#endif +# endif -#if defined(__STDCPP_THREADS__) && defined(_LIBCUDACXX_HAS_NO_THREADS) -#error _LIBCUDACXX_HAS_NO_THREADS cannot be set when __STDCPP_THREADS__ is set. -#endif +# if defined(__STDCPP_THREADS__) && defined(_LIBCUDACXX_HAS_NO_THREADS) +# error _LIBCUDACXX_HAS_NO_THREADS cannot be set when __STDCPP_THREADS__ is set. +# endif -#if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(__STDCPP_THREADS__) -#define __STDCPP_THREADS__ 1 -#endif +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(__STDCPP_THREADS__) +# define __STDCPP_THREADS__ 1 +# endif // The glibc and Bionic implementation of pthreads implements // pthread_mutex_destroy as nop for regular mutexes. Additionally, Win32 @@ -1730,10 +1733,9 @@ extern "C" _LIBCUDACXX_FUNC_VIS void __sanitizer_annotate_contiguous_container( // // TODO(EricWF): Enable this optimization on Bionic after speaking to their // respective stakeholders. 
-#if (defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) && defined(__GLIBC__)) \ - || defined(_LIBCUDACXX_HAS_THREAD_API_WIN32) -# define _LIBCUDACXX_HAS_TRIVIAL_MUTEX_DESTRUCTION -#endif +# if (defined(_LIBCUDACXX_HAS_THREAD_API_PTHREAD) && defined(__GLIBC__)) || defined(_LIBCUDACXX_HAS_THREAD_API_WIN32) +# define _LIBCUDACXX_HAS_TRIVIAL_MUTEX_DESTRUCTION +# endif // Destroying a condvar is a nop on Windows. // @@ -1743,123 +1745,121 @@ extern "C" _LIBCUDACXX_FUNC_VIS void __sanitizer_annotate_contiguous_container( // // TODO(EricWF): This is potentially true for some pthread implementations // as well. -#if defined(_LIBCUDACXX_HAS_THREAD_API_WIN32) -# define _LIBCUDACXX_HAS_TRIVIAL_CONDVAR_DESTRUCTION -#endif +# if defined(_LIBCUDACXX_HAS_THREAD_API_WIN32) +# define _LIBCUDACXX_HAS_TRIVIAL_CONDVAR_DESTRUCTION +# endif // Systems that use capability-based security (FreeBSD with Capsicum, // Nuxi CloudABI) may only provide local filesystem access (using *at()). // Functions like open(), rename(), unlink() and stat() should not be // used, as they attempt to access the global filesystem namespace. -#ifdef __CloudABI__ -#define _LIBCUDACXX_HAS_NO_GLOBAL_FILESYSTEM_NAMESPACE -#endif +# ifdef __CloudABI__ +# define _LIBCUDACXX_HAS_NO_GLOBAL_FILESYSTEM_NAMESPACE +# endif // CloudABI is intended for running networked services. Processes do not // have standard input and output channels. -#ifdef __CloudABI__ -#define _LIBCUDACXX_HAS_NO_STDIN -#define _LIBCUDACXX_HAS_NO_STDOUT -#endif +# ifdef __CloudABI__ +# define _LIBCUDACXX_HAS_NO_STDIN +# define _LIBCUDACXX_HAS_NO_STDOUT +# endif // Some systems do not provide gets() in their C library, for security reasons. -#ifndef _LIBCUDACXX_C_HAS_NO_GETS -# if defined(_LIBCUDACXX_MSVCRT) || (defined(__FreeBSD__) && __FreeBSD__ >= 13) -# define _LIBCUDACXX_C_HAS_NO_GETS +# ifndef _LIBCUDACXX_C_HAS_NO_GETS +# if defined(_LIBCUDACXX_MSVCRT) || (defined(__FreeBSD__) && __FreeBSD__ >= 13) +# define _LIBCUDACXX_C_HAS_NO_GETS +# endif # endif -#endif -#if defined(__BIONIC__) || defined(__CloudABI__) || \ - defined(__Fuchsia__) || defined(__wasi__) || defined(_LIBCUDACXX_HAS_MUSL_LIBC) -#define _LIBCUDACXX_PROVIDES_DEFAULT_RUNE_TABLE -#endif +# if defined(__BIONIC__) || defined(__CloudABI__) || defined(__Fuchsia__) || defined(__wasi__) \ + || defined(_LIBCUDACXX_HAS_MUSL_LIBC) +# define _LIBCUDACXX_PROVIDES_DEFAULT_RUNE_TABLE +# endif // Thread-unsafe functions such as strtok() and localtime() // are not available. -#ifdef __CloudABI__ -#define _LIBCUDACXX_HAS_NO_THREAD_UNSAFE_C_FUNCTIONS -#endif +# ifdef __CloudABI__ +# define _LIBCUDACXX_HAS_NO_THREAD_UNSAFE_C_FUNCTIONS +# endif // TODO: Support C11 Atomics? 
// #if __has_feature(cxx_atomic) || __has_extension(c_atomic) || __has_keyword(_Atomic) // # define _LIBCUDACXX_HAS_C_ATOMIC_IMP -#if defined(_CCCL_COMPILER_ICC) -# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP -#elif defined(_CCCL_COMPILER_CLANG) -# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP -#elif defined(_CCCL_COMPILER_GCC) -# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP -#elif defined(_CCCL_COMPILER_NVHPC) -# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP -#elif defined(_CCCL_COMPILER_MSVC) -# define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL -#endif +# if defined(_CCCL_COMPILER_ICC) +# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP +# elif defined(_CCCL_COMPILER_CLANG) +# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP +# elif defined(_CCCL_COMPILER_GCC) +# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP +# elif defined(_CCCL_COMPILER_NVHPC) +# define _LIBCUDACXX_HAS_GCC_ATOMIC_IMP +# elif defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL +# endif // CUDA Atomics supersede host atomics in order to insert the host/device dispatch layer -#if defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_COMPILER_NVRTC) || defined(_CCCL_COMPILER_NVHPC) || defined(_CCCL_CUDACC) -# define _LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL -#endif - -#if (!defined(_LIBCUDACXX_HAS_C_ATOMIC_IMP) && \ - !defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) && \ - !defined(_LIBCUDACXX_HAS_EXTERNAL_ATOMIC_IMP)) \ - || defined(_LIBCUDACXX_HAS_NO_THREADS) -# define _LIBCUDACXX_HAS_NO_ATOMIC_HEADER -#else -# ifdef __cuda_std__ -# undef _LIBCUDACXX_ATOMIC_FLAG_TYPE -# define _LIBCUDACXX_ATOMIC_FLAG_TYPE int -# endif -# ifndef _LIBCUDACXX_ATOMIC_FLAG_TYPE -# define _LIBCUDACXX_ATOMIC_FLAG_TYPE bool +# if defined(_CCCL_CUDA_COMPILER_NVCC) || defined(_CCCL_COMPILER_NVRTC) || defined(_CCCL_COMPILER_NVHPC) \ + || defined(_CCCL_CUDACC) +# define _LIBCUDACXX_HAS_CUDA_ATOMIC_IMPL # endif -# ifdef _LIBCUDACXX_FREESTANDING -# define _LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS + +# if (!defined(_LIBCUDACXX_HAS_C_ATOMIC_IMP) && !defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) \ + && !defined(_LIBCUDACXX_HAS_EXTERNAL_ATOMIC_IMP)) \ + || defined(_LIBCUDACXX_HAS_NO_THREADS) +# define _LIBCUDACXX_HAS_NO_ATOMIC_HEADER +# else +# ifdef __cuda_std__ +# undef _LIBCUDACXX_ATOMIC_FLAG_TYPE +# define _LIBCUDACXX_ATOMIC_FLAG_TYPE int +# endif +# ifndef _LIBCUDACXX_ATOMIC_FLAG_TYPE +# define _LIBCUDACXX_ATOMIC_FLAG_TYPE bool +# endif +# ifdef _LIBCUDACXX_FREESTANDING +# define _LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS +# endif # endif -#endif -#ifndef _LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK -#define _LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK -#endif +# ifndef _LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK +# define _LIBCUDACXX_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK +# endif -#if defined(_LIBCUDACXX_ENABLE_THREAD_SAFETY_ANNOTATIONS) -# if defined(_CCCL_COMPILER_CLANG) && __has_attribute(acquire_capability) +# if defined(_LIBCUDACXX_ENABLE_THREAD_SAFETY_ANNOTATIONS) +# if defined(_CCCL_COMPILER_CLANG) && __has_attribute(acquire_capability) // Work around the attribute handling in clang. When both __declspec and // __attribute__ are present, the processing goes awry preventing the definition // of the types. 
-# if !defined(_LIBCUDACXX_OBJECT_FORMAT_COFF) -# define _LIBCUDACXX_HAS_THREAD_SAFETY_ANNOTATIONS +# if !defined(_LIBCUDACXX_OBJECT_FORMAT_COFF) +# define _LIBCUDACXX_HAS_THREAD_SAFETY_ANNOTATIONS +# endif # endif # endif -#endif -#if __has_attribute(require_constant_initialization) -# define _LIBCUDACXX_SAFE_STATIC __attribute__((__require_constant_initialization__)) -#else -# define _LIBCUDACXX_SAFE_STATIC -#endif +# if __has_attribute(require_constant_initialization) +# define _LIBCUDACXX_SAFE_STATIC __attribute__((__require_constant_initialization__)) +# else +# define _LIBCUDACXX_SAFE_STATIC +# endif -#if !defined(_LIBCUDACXX_HAS_NO_OFF_T_FUNCTIONS) -# if defined(_LIBCUDACXX_MSVCRT) || defined(_NEWLIB_VERSION) -# define _LIBCUDACXX_HAS_NO_OFF_T_FUNCTIONS +# if !defined(_LIBCUDACXX_HAS_NO_OFF_T_FUNCTIONS) +# if defined(_LIBCUDACXX_MSVCRT) || defined(_NEWLIB_VERSION) +# define _LIBCUDACXX_HAS_NO_OFF_T_FUNCTIONS +# endif # endif -#endif -#if __has_attribute(diagnose_if) && !defined(_LIBCUDACXX_DISABLE_ADDITIONAL_DIAGNOSTICS) -# define _LIBCUDACXX_DIAGNOSE_WARNING(...) \ - __attribute__((diagnose_if(__VA_ARGS__, "warning"))) -# define _LIBCUDACXX_DIAGNOSE_ERROR(...) \ - __attribute__((diagnose_if(__VA_ARGS__, "error"))) -#else -# define _LIBCUDACXX_DIAGNOSE_WARNING(...) -# define _LIBCUDACXX_DIAGNOSE_ERROR(...) -#endif +# if __has_attribute(diagnose_if) && !defined(_LIBCUDACXX_DISABLE_ADDITIONAL_DIAGNOSTICS) +# define _LIBCUDACXX_DIAGNOSE_WARNING(...) __attribute__((diagnose_if(__VA_ARGS__, "warning"))) +# define _LIBCUDACXX_DIAGNOSE_ERROR(...) __attribute__((diagnose_if(__VA_ARGS__, "error"))) +# else +# define _LIBCUDACXX_DIAGNOSE_WARNING(...) +# define _LIBCUDACXX_DIAGNOSE_ERROR(...) +# endif -#if __has_attribute(__nodebug__) -#define _LIBCUDACXX_NODEBUG __attribute__((__nodebug__)) -#else -#define _LIBCUDACXX_NODEBUG -#endif +# if __has_attribute(__nodebug__) +# define _LIBCUDACXX_NODEBUG __attribute__((__nodebug__)) +# else +# define _LIBCUDACXX_NODEBUG +# endif # if __has_attribute(__preferred_name__) # define _LIBCUDACXX_PREFERRED_NAME(x) __attribute__((__preferred_name__(x))) @@ -1867,47 +1867,46 @@ extern "C" _LIBCUDACXX_FUNC_VIS void __sanitizer_annotate_contiguous_container( # define _LIBCUDACXX_PREFERRED_NAME(x) # endif -#if defined(_LIBCUDACXX_ABI_MICROSOFT) && \ - (defined(_CCCL_COMPILER_MSVC) || __has_declspec_attribute(empty_bases)) -# define _LIBCUDACXX_DECLSPEC_EMPTY_BASES __declspec(empty_bases) -#else -# define _LIBCUDACXX_DECLSPEC_EMPTY_BASES -#endif +# if defined(_LIBCUDACXX_ABI_MICROSOFT) && (defined(_CCCL_COMPILER_MSVC) || __has_declspec_attribute(empty_bases)) +# define _LIBCUDACXX_DECLSPEC_EMPTY_BASES __declspec(empty_bases) +# else +# define _LIBCUDACXX_DECLSPEC_EMPTY_BASES +# endif -#if defined(_LIBCUDACXX_ENABLE_CXX17_REMOVED_FEATURES) -#define _LIBCUDACXX_ENABLE_CXX17_REMOVED_AUTO_PTR -#define _LIBCUDACXX_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS -#define _LIBCUDACXX_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE -#define _LIBCUDACXX_ENABLE_CXX17_REMOVED_BINDERS -#endif // _LIBCUDACXX_ENABLE_CXX17_REMOVED_FEATURES +# if defined(_LIBCUDACXX_ENABLE_CXX17_REMOVED_FEATURES) +# define _LIBCUDACXX_ENABLE_CXX17_REMOVED_AUTO_PTR +# define _LIBCUDACXX_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS +# define _LIBCUDACXX_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE +# define _LIBCUDACXX_ENABLE_CXX17_REMOVED_BINDERS +# endif // _LIBCUDACXX_ENABLE_CXX17_REMOVED_FEATURES -#if !defined(__cpp_deduction_guides) || __cpp_deduction_guides < 201611 -#define _LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES -#endif 
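The _LIBCUDACXX_DIAGNOSE_WARNING and _LIBCUDACXX_DIAGNOSE_ERROR helpers reformatted above wrap clang's diagnose_if attribute. A hedged sketch of how a declaration can use them follows; the function, condition, and message are invented for illustration. The atomic hunks later in this patch build _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER and friends on the same mechanism.

// Warns at compile time under clang when the condition is computable from the call arguments;
// on compilers without diagnose_if the macro expands to nothing and the declaration is plain.
void __example_store(int* __ptr, int __val, int __order)
  _LIBCUDACXX_DIAGNOSE_WARNING(__order < 0, "a negative order is never valid for this operation");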
+# if !defined(__cpp_deduction_guides) || __cpp_deduction_guides < 201611 +# define _LIBCUDACXX_HAS_NO_DEDUCTION_GUIDES +# endif -#if !defined(__cpp_coroutines) || __cpp_coroutines < 201703L -#define _LIBCUDACXX_HAS_NO_COROUTINES -#endif +# if !defined(__cpp_coroutines) || __cpp_coroutines < 201703L +# define _LIBCUDACXX_HAS_NO_COROUTINES +# endif // We need `is_constant_evaluated` for clang and gcc. MSVC also needs extensive rework -#if !defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) -#define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -#elif defined(_CCCL_COMPILER_NVRTC) -#define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -#elif defined(_CCCL_COMPILER_MSVC) -#define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -#elif defined(_CCCL_CUDACC_BELOW_11_8) -#define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -#elif defined(_CCCL_CUDA_COMPILER_CLANG) -#define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS -#endif +# if !defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) +# define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS +# elif defined(_CCCL_COMPILER_NVRTC) +# define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS +# elif defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS +# elif defined(_CCCL_CUDACC_BELOW_11_8) +# define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS +# elif defined(_CCCL_CUDA_COMPILER_CLANG) +# define _LIBCUDACXX_HAS_NO_CONSTEXPR_COMPLEX_OPERATIONS +# endif // FIXME: Correct this macro when either (A) a feature test macro for the // spaceship operator is provided, or (B) a compiler provides a complete // implementation. -#define _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR +# define _LIBCUDACXX_HAS_NO_SPACESHIP_OPERATOR -#define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS +# define _LIBCUDACXX_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS // The stream API was dropped and re-added in the dylib shipped on macOS // and iOS. We can only assume the dylib to provide these definitions for @@ -1916,123 +1915,114 @@ extern "C" _LIBCUDACXX_FUNC_VIS void __sanitizer_annotate_contiguous_container( // declarations for streams exist conditionally to this; if we provide // an explicit instantiation declaration and we try to deploy to a dylib // that does not provide those symbols, we'll get a load-time error. -#if !defined(_LIBCUDACXX_BUILDING_LIBRARY) && \ - ((defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 1090) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && \ - __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 70000)) -# define _LIBCUDACXX_DO_NOT_ASSUME_STREAMS_EXPLICIT_INSTANTIATION_IN_DYLIB -#endif - -#if defined(_LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO) -# define _LIBCUDACXX_PUSH_MACROS -# define _LIBCUDACXX_POP_MACROS -#else - // Don't warn about macro conflicts when we can restore them at the - // end of the header. 
-# ifndef _LIBCUDACXX_DISABLE_MACRO_CONFLICT_WARNINGS -# define _LIBCUDACXX_DISABLE_MACRO_CONFLICT_WARNINGS +# if !defined(_LIBCUDACXX_BUILDING_LIBRARY) \ + && ((defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 1090) \ + || (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) \ + && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 70000)) +# define _LIBCUDACXX_DO_NOT_ASSUME_STREAMS_EXPLICIT_INSTANTIATION_IN_DYLIB # endif -# if defined(_CCCL_COMPILER_MSVC) -# define _LIBCUDACXX_PUSH_MACROS \ - __pragma(push_macro("min")) \ - __pragma(push_macro("max")) -# define _LIBCUDACXX_POP_MACROS \ - __pragma(pop_macro("min")) \ - __pragma(pop_macro("max")) + +# if defined(_LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO) +# define _LIBCUDACXX_PUSH_MACROS +# define _LIBCUDACXX_POP_MACROS # else -# define _LIBCUDACXX_PUSH_MACROS \ - _Pragma("push_macro(\"min\")") \ - _Pragma("push_macro(\"max\")") -# define _LIBCUDACXX_POP_MACROS \ - _Pragma("pop_macro(\"min\")") \ - _Pragma("pop_macro(\"max\")") -# endif -#endif // defined(_LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO) - -#if !defined(_LIBCUDACXX_NO_AUTO_LINK) && !defined(__cuda_std__) -# if defined(_LIBCUDACXX_ABI_MICROSOFT) && !defined(_LIBCUDACXX_BUILDING_LIBRARY) -# if defined(_DLL) -# pragma comment(lib, "c++.lib") +// Don't warn about macro conflicts when we can restore them at the +// end of the header. +# ifndef _LIBCUDACXX_DISABLE_MACRO_CONFLICT_WARNINGS +# define _LIBCUDACXX_DISABLE_MACRO_CONFLICT_WARNINGS +# endif +# if defined(_CCCL_COMPILER_MSVC) +# define _LIBCUDACXX_PUSH_MACROS __pragma(push_macro("min")) __pragma(push_macro("max")) +# define _LIBCUDACXX_POP_MACROS __pragma(pop_macro("min")) __pragma(pop_macro("max")) # else -# pragma comment(lib, "libc++.lib") +# define _LIBCUDACXX_PUSH_MACROS _Pragma("push_macro(\"min\")") _Pragma("push_macro(\"max\")") +# define _LIBCUDACXX_POP_MACROS _Pragma("pop_macro(\"min\")") _Pragma("pop_macro(\"max\")") # endif -# endif // defined(_LIBCUDACXX_ABI_MICROSOFT) && !defined(_LIBCUDACXX_BUILDING_LIBRARY) -#endif // !defined(_LIBCUDACXX_NO_AUTO_LINK) +# endif // defined(_LIBCUDACXX_HAS_NO_PRAGMA_PUSH_POP_MACRO) + +# if !defined(_LIBCUDACXX_NO_AUTO_LINK) && !defined(__cuda_std__) +# if defined(_LIBCUDACXX_ABI_MICROSOFT) && !defined(_LIBCUDACXX_BUILDING_LIBRARY) +# if defined(_DLL) +# pragma comment(lib, "c++.lib") +# else +# pragma comment(lib, "libc++.lib") +# endif +# endif // defined(_LIBCUDACXX_ABI_MICROSOFT) && !defined(_LIBCUDACXX_BUILDING_LIBRARY) +# endif // !defined(_LIBCUDACXX_NO_AUTO_LINK) -#define _LIBCUDACXX_UNUSED_VAR(x) ((void)(x)) +# define _LIBCUDACXX_UNUSED_VAR(x) ((void) (x)) // Configures the fopen close-on-exec mode character, if any. This string will // be appended to any mode string used by fstream for fopen/fdopen. // // Not all platforms support this, but it helps avoid fd-leaks on platforms that // do. -#if defined(__BIONIC__) -# define _LIBCUDACXX_FOPEN_CLOEXEC_MODE "e" -#else -# define _LIBCUDACXX_FOPEN_CLOEXEC_MODE -#endif +# if defined(__BIONIC__) +# define _LIBCUDACXX_FOPEN_CLOEXEC_MODE "e" +# else +# define _LIBCUDACXX_FOPEN_CLOEXEC_MODE +# endif # if __has_attribute(__format__) // The attribute uses 1-based indices for ordinary and static member functions. // The attribute uses 2-based indices for non-static member functions. 
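A sketch of the pattern the _LIBCUDACXX_PUSH_MACROS / _LIBCUDACXX_POP_MACROS pair above exists for; the library's real headers route this through small helper includes, and the __pragma_pop hunk later in this patch is the restoring half, but the idea inlined looks like this (surrounding header content imaginary).

_LIBCUDACXX_PUSH_MACROS // remember any user-defined min/max macros (a no-op where push_macro is unavailable)
#undef min              // the header can now declare entities named min/max safely
#undef max
// ... header content ...
_LIBCUDACXX_POP_MACROS  // the user's macro definitions, if any, are back in effect from here on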
-# define _LIBCUDACXX_ATTRIBUTE_FORMAT(archetype, format_string_index, first_format_arg_index) \ +# define _LIBCUDACXX_ATTRIBUTE_FORMAT(archetype, format_string_index, first_format_arg_index) \ __attribute__((__format__(archetype, format_string_index, first_format_arg_index))) # else # define _LIBCUDACXX_ATTRIBUTE_FORMAT(archetype, format_string_index, first_format_arg_index) /* nothing */ # endif -#ifndef _LIBCUDACXX_SYS_CLOCK_DURATION -#if defined(__cuda_std__) -# define _LIBCUDACXX_SYS_CLOCK_DURATION nanoseconds -#else -# define _LIBCUDACXX_SYS_CLOCK_DURATION microseconds -#endif -#endif // _LIBCUDACXX_SYS_CLOCK_DURATION +# ifndef _LIBCUDACXX_SYS_CLOCK_DURATION +# if defined(__cuda_std__) +# define _LIBCUDACXX_SYS_CLOCK_DURATION nanoseconds +# else +# define _LIBCUDACXX_SYS_CLOCK_DURATION microseconds +# endif +# endif // _LIBCUDACXX_SYS_CLOCK_DURATION // There are a handful of public standard library types that are intended to // support CTAD but don't need any explicit deduction guides to do so. This // macro is used to mark them as such, which suppresses the // '-Wctad-maybe-unsupported' compiler warning when CTAD is used in user code // with these classes. -#if (!defined(_CCCL_COMPILER_GCC) || __GNUC__ > 6) \ - && _CCCL_STD_VER >= 2017 -# define _LIBCUDACXX_CTAD_SUPPORTED_FOR_TYPE(_ClassName) \ - template \ - _ClassName(typename _Tag::__allow_ctad...) -> _ClassName<_Tag...> -#else -# define _LIBCUDACXX_CTAD_SUPPORTED_FOR_TYPE(_ClassName) static_assert(true, "") -#endif +# if (!defined(_CCCL_COMPILER_GCC) || __GNUC__ > 6) && _CCCL_STD_VER >= 2017 +# define _LIBCUDACXX_CTAD_SUPPORTED_FOR_TYPE(_ClassName) \ + template \ + _ClassName(typename _Tag::__allow_ctad...)->_ClassName<_Tag...> +# else +# define _LIBCUDACXX_CTAD_SUPPORTED_FOR_TYPE(_ClassName) static_assert(true, "") +# endif -#if (defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ <= 11) \ - && (defined(__CUDACC_VER_MINOR__) && __CUDACC_VER_MINOR__ <= 2) -# define _LIBCUDACXX_CONSTEXPR_GLOBAL const -#else -# define _LIBCUDACXX_CONSTEXPR_GLOBAL constexpr -#endif +# if (defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ <= 11) \ + && (defined(__CUDACC_VER_MINOR__) && __CUDACC_VER_MINOR__ <= 2) +# define _LIBCUDACXX_CONSTEXPR_GLOBAL const +# else +# define _LIBCUDACXX_CONSTEXPR_GLOBAL constexpr +# endif -#if defined(__CUDA_ARCH__) -# define _LIBCUDACXX_CPO_ACCESSIBILITY _CCCL_DEVICE _LIBCUDACXX_CONSTEXPR_GLOBAL -#else -# define _LIBCUDACXX_CPO_ACCESSIBILITY _LIBCUDACXX_INLINE_VAR constexpr -#endif +# if defined(__CUDA_ARCH__) +# define _LIBCUDACXX_CPO_ACCESSIBILITY _CCCL_DEVICE _LIBCUDACXX_CONSTEXPR_GLOBAL +# else +# define _LIBCUDACXX_CPO_ACCESSIBILITY _LIBCUDACXX_INLINE_VAR constexpr +# endif -#if _CCCL_STD_VER > 2014 -# define _LIBCUDACXX_TRAIT(__TRAIT, ...) __TRAIT##_v<__VA_ARGS__> -#else -# define _LIBCUDACXX_TRAIT(__TRAIT, ...) __TRAIT<__VA_ARGS__>::value -#endif +# if _CCCL_STD_VER > 2014 +# define _LIBCUDACXX_TRAIT(__TRAIT, ...) __TRAIT##_v<__VA_ARGS__> +# else +# define _LIBCUDACXX_TRAIT(__TRAIT, ...) __TRAIT<__VA_ARGS__>::value +# endif // Older nvcc do not handle the constraint of `construct_at` in earlier std modes // So to preserve our performance optimization we default to the unconstrained // `__construct_at` and only in C++20 use `construct_at` -#if _CCCL_STD_VER > 2017 -# define _LIBCUDACXX_CONSTRUCT_AT(_LOCATION, ...) \ - _CUDA_VSTD::construct_at(_CUDA_VSTD::addressof(_LOCATION), __VA_ARGS__) -#else -# define _LIBCUDACXX_CONSTRUCT_AT(_LOCATION, ...) 
\ - _CUDA_VSTD::__construct_at(_CUDA_VSTD::addressof(_LOCATION), __VA_ARGS__) -#endif +# if _CCCL_STD_VER > 2017 +# define _LIBCUDACXX_CONSTRUCT_AT(_LOCATION, ...) \ + _CUDA_VSTD::construct_at(_CUDA_VSTD::addressof(_LOCATION), __VA_ARGS__) +# else +# define _LIBCUDACXX_CONSTRUCT_AT(_LOCATION, ...) \ + _CUDA_VSTD::__construct_at(_CUDA_VSTD::addressof(_LOCATION), __VA_ARGS__) +# endif // We can only expose constexpr allocations if the compiler supports it # if defined(__cpp_constexpr_dynamic_alloc) && defined(__cpp_lib_constexpr_dynamic_alloc) && _CCCL_STD_VER >= 2020 \ @@ -2061,7 +2051,7 @@ extern "C" _LIBCUDACXX_FUNC_VIS void __sanitizer_annotate_contiguous_container( constexpr __class() noexcept = default; # endif // !_CCCL_COMPILER_NVRTC || nvcc >= 11.3 -#define _LIBCUDACXX_HAS_NO_INCOMPLETE_RANGES +# define _LIBCUDACXX_HAS_NO_INCOMPLETE_RANGES #endif // __cplusplus diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__pragma_pop b/libcudacxx/include/cuda/std/detail/libcxx/include/__pragma_pop index 27a9a68b4e6..5bd85a09940 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__pragma_pop +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__pragma_pop @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// #if defined(_LIBCUDACXX_USE_PRAGMA_MSVC_WARNING) - #pragma warning(pop) +# pragma warning(pop) #endif #if defined(_LIBCUDACXX_POP_MACROS) - _LIBCUDACXX_POP_MACROS +_LIBCUDACXX_POP_MACROS #endif diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic index 298b69726f9..2d0a2e56af6 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/atomic +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/atomic @@ -556,9 +556,6 @@ void atomic_signal_fence(memory_order m) noexcept; # pragma system_header #endif // no system header -#include // all public C++ headers provide the assertion handler -#include -#include #include #include #include @@ -568,42 +565,42 @@ void atomic_signal_fence(memory_order m) noexcept; #include #include #include -#include #include #include +#include // all public C++ headers provide the assertion handler +#include +#include +#include #include #include #include #ifdef _LIBCUDACXX_HAS_NO_THREADS -# error is not supported on this single threaded system +# error is not supported on this single threaded system #endif #ifdef _LIBCUDACXX_HAS_NO_ATOMIC_HEADER -# error is not implemented +# error is not implemented #endif #ifdef _LIBCUDACXX_UNSUPPORTED_THREAD_API -# error " is not supported on this system" +# error " is not supported on this system" #endif #ifdef kill_dependency -# error C++ standard library is incompatible with +# error C++ standard library is incompatible with #endif -#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_consume || \ - __m == memory_order_acquire || \ - __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") +#define _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) \ + _LIBCUDACXX_DIAGNOSE_WARNING( \ + __m == memory_order_consume || __m == memory_order_acquire || __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") -#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || \ - __m == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") +#define _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) \ + 
_LIBCUDACXX_DIAGNOSE_WARNING(__m == memory_order_release || __m == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") -#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ - _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || \ - __f == memory_order_acq_rel, \ - "memory order argument to atomic operation is invalid") +#define _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__m, __f) \ + _LIBCUDACXX_DIAGNOSE_WARNING(__f == memory_order_release || __f == memory_order_acq_rel, \ + "memory order argument to atomic operation is invalid") #if defined(_LIBCUDACXX_HAS_MSVC_ATOMIC_IMPL) # include @@ -614,25 +611,25 @@ void atomic_signal_fence(memory_order m) noexcept; #endif #if !defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -#define ATOMIC_BOOL_LOCK_FREE 2 -#define ATOMIC_CHAR_LOCK_FREE 2 -#define ATOMIC_CHAR16_T_LOCK_FREE 2 -#define ATOMIC_CHAR32_T_LOCK_FREE 2 -#define ATOMIC_WCHAR_T_LOCK_FREE 2 -#define ATOMIC_SHORT_LOCK_FREE 2 -#define ATOMIC_INT_LOCK_FREE 2 -#define ATOMIC_LONG_LOCK_FREE 2 -#define ATOMIC_LLONG_LOCK_FREE 2 -#define ATOMIC_POINTER_LOCK_FREE 2 -#endif //!defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) +# define ATOMIC_BOOL_LOCK_FREE 2 +# define ATOMIC_CHAR_LOCK_FREE 2 +# define ATOMIC_CHAR16_T_LOCK_FREE 2 +# define ATOMIC_CHAR32_T_LOCK_FREE 2 +# define ATOMIC_WCHAR_T_LOCK_FREE 2 +# define ATOMIC_SHORT_LOCK_FREE 2 +# define ATOMIC_INT_LOCK_FREE 2 +# define ATOMIC_LONG_LOCK_FREE 2 +# define ATOMIC_LLONG_LOCK_FREE 2 +# define ATOMIC_POINTER_LOCK_FREE 2 +#endif //! defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) && !defined(__GCC_ATOMIC_BOOL_LOCK_FREE) #ifndef __ATOMIC_RELAXED -#define __ATOMIC_RELAXED 0 -#define __ATOMIC_CONSUME 1 -#define __ATOMIC_ACQUIRE 2 -#define __ATOMIC_RELEASE 3 -#define __ATOMIC_ACQ_REL 4 -#define __ATOMIC_SEQ_CST 5 +# define __ATOMIC_RELAXED 0 +# define __ATOMIC_CONSUME 1 +# define __ATOMIC_ACQUIRE 2 +# define __ATOMIC_RELEASE 3 +# define __ATOMIC_ACQ_REL 4 +# define __ATOMIC_SEQ_CST 5 #endif //__ATOMIC_RELAXED _LIBCUDACXX_BEGIN_NAMESPACE_STD @@ -640,20 +637,22 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD // Figure out what the underlying type for `memory_order` would be if it were // declared as an unscoped enum (accounting for -fshort-enums). Use this result // to pin the underlying type in C++20. 
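A standalone illustration of the pinning described in the comment above, with names chosen purely for the example (the real enums follow in the hunk below): the unscoped helper enum lets the compiler pick whatever underlying type it historically used, possibly shrunk by -fshort-enums, and the C++20 scoped enum is then declared with exactly that type so the switch is ABI-neutral.

#include <type_traits>

enum __example_legacy_order { __eo_relaxed, __eo_seq_cst }; // compiler chooses the underlying type here
using __example_underlying_t = std::underlying_type<__example_legacy_order>::type;
enum class __example_order : __example_underlying_t // scoped enum pinned to that same type
{
  relaxed = __eo_relaxed,
  seq_cst = __eo_seq_cst
};
static_assert(sizeof(__example_order) == sizeof(__example_legacy_order),
              "switching to the scoped enum does not change the representation");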
-enum __legacy_memory_order { - __mo_relaxed, - __mo_consume, - __mo_acquire, - __mo_release, - __mo_acq_rel, - __mo_seq_cst +enum __legacy_memory_order +{ + __mo_relaxed, + __mo_consume, + __mo_acquire, + __mo_release, + __mo_acq_rel, + __mo_seq_cst }; typedef underlying_type<__legacy_memory_order>::type __memory_order_underlying_t; #if _CCCL_STD_VER > 2017 -enum class memory_order : __memory_order_underlying_t { +enum class memory_order : __memory_order_underlying_t +{ relaxed = __mo_relaxed, consume = __mo_consume, acquire = __mo_acquire, @@ -671,7 +670,8 @@ inline constexpr auto memory_order_seq_cst = memory_order::seq_cst; #else -typedef enum memory_order { +typedef enum memory_order +{ memory_order_relaxed = __mo_relaxed, memory_order_consume = __mo_consume, memory_order_acquire = __mo_acquire, @@ -682,43 +682,48 @@ typedef enum memory_order { #endif // _CCCL_STD_VER > 2017 -template _LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) { +template +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_nonatomic_compare_equal(_Tp const& __lhs, _Tp const& __rhs) +{ #if defined(_CCCL_CUDA_COMPILER) - return __lhs == __rhs; + return __lhs == __rhs; #else - return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; + return memcmp(&__lhs, &__rhs, sizeof(_Tp)) == 0; #endif } static_assert((is_same::type, __memory_order_underlying_t>::value), - "unexpected underlying type for std::memory_order"); + "unexpected underlying type for std::memory_order"); -#if defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) || \ - defined(_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS) +#if defined(_LIBCUDACXX_HAS_GCC_ATOMIC_IMP) || defined(_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS) // [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because // the default operator= in an object is not volatile, a byte-by-byte copy // is required. -template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) { +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value> +__cxx_atomic_assign_volatile(_Tp& __a_value, _Tv const& __val) +{ __a_value = __val; } -template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t::value> -__cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) { - volatile char* __to = reinterpret_cast(&__a_value); - volatile char* __end = __to + sizeof(_Tp); +template +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value> +__cxx_atomic_assign_volatile(_Tp volatile& __a_value, _Tv volatile const& __val) +{ + volatile char* __to = reinterpret_cast(&__a_value); + volatile char* __end = __to + sizeof(_Tp); volatile const char* __from = reinterpret_cast(&__val); while (__to != __end) + { *__to++ = *__from++; + } } #endif // Headers are wrapped like so: (cuda::std::|std::)detail -namespace __detail { +namespace __detail +{ #if defined(_LIBCUDACXX_HAS_CUDA_ATOMIC_EXT) # include #endif @@ -733,91 +738,98 @@ namespace __detail { // TODO: Maybe support C11 atomics? 
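The byte-wise volatile assignment introduced above follows from [atomics.types.generic]p1: _Tp is trivially copyable, but the implicitly defined operator= is not volatile-qualified, so it cannot be applied to a volatile object. A self-contained sketch of the same idea, with illustrative names rather than the library's:

struct pair_t { int a, b; };   // trivially copyable, no volatile-qualified operator=

// Copy a trivially copyable value into volatile storage one byte at a time.
template <class T>
void assign_volatile(T volatile& dst, T const volatile& src)
{
  volatile char*       to   = reinterpret_cast<volatile char*>(&dst);
  volatile char* const end  = to + sizeof(T);
  volatile const char* from = reinterpret_cast<volatile const char*>(&src);
  while (to != end)
  {
    *to++ = *from++;
  }
}

void demo()
{
  volatile pair_t dst{};
  pair_t src{1, 2};
  // dst = src;              // ill-formed: implicit operator= is not volatile-qualified
  assign_volatile(dst, src); // byte-wise copy is valid for trivially copyable types
}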
// #include #endif // _LIBCUDACXX_HAS_GCC_ATOMIC_IMP, _LIBCUDACXX_HAS_C_ATOMIC_IMP -} +} // namespace __detail using __detail::__cxx_atomic_base_impl; -using __detail::__cxx_atomic_ref_base_impl; -using __detail::__cxx_atomic_thread_fence; -using __detail::__cxx_atomic_signal_fence; -using __detail::__cxx_atomic_load; -using __detail::__cxx_atomic_store; -using __detail::__cxx_atomic_exchange; -using __detail::__cxx_atomic_compare_exchange_weak; using __detail::__cxx_atomic_compare_exchange_strong; +using __detail::__cxx_atomic_compare_exchange_weak; +using __detail::__cxx_atomic_exchange; using __detail::__cxx_atomic_fetch_add; -using __detail::__cxx_atomic_fetch_sub; -using __detail::__cxx_atomic_fetch_or; using __detail::__cxx_atomic_fetch_and; +using __detail::__cxx_atomic_fetch_or; +using __detail::__cxx_atomic_fetch_sub; using __detail::__cxx_atomic_fetch_xor; +using __detail::__cxx_atomic_load; +using __detail::__cxx_atomic_ref_base_impl; +using __detail::__cxx_atomic_signal_fence; +using __detail::__cxx_atomic_store; +using __detail::__cxx_atomic_thread_fence; template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp kill_dependency(_Tp __y) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp kill_dependency(_Tp __y) noexcept { - return __y; + return __y; } #if defined(__CLANG_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE +# define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE +# define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE +# define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE +# define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE +# define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE +# define ATOMIC_SHORT_LOCK_FREE __CLANG_ATOMIC_SHORT_LOCK_FREE +# define ATOMIC_INT_LOCK_FREE __CLANG_ATOMIC_INT_LOCK_FREE +# define ATOMIC_LONG_LOCK_FREE __CLANG_ATOMIC_LONG_LOCK_FREE +# define ATOMIC_LLONG_LOCK_FREE __CLANG_ATOMIC_LLONG_LOCK_FREE +# define ATOMIC_POINTER_LOCK_FREE __CLANG_ATOMIC_POINTER_LOCK_FREE #elif defined(__GCC_ATOMIC_BOOL_LOCK_FREE) -# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE -# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE -# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE -# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE -# define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE -# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE -# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE -# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE -# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE -# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE +# define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE +# define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE +# define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE +# define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE +# define 
ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE +# define ATOMIC_SHORT_LOCK_FREE __GCC_ATOMIC_SHORT_LOCK_FREE +# define ATOMIC_INT_LOCK_FREE __GCC_ATOMIC_INT_LOCK_FREE +# define ATOMIC_LONG_LOCK_FREE __GCC_ATOMIC_LONG_LOCK_FREE +# define ATOMIC_LLONG_LOCK_FREE __GCC_ATOMIC_LLONG_LOCK_FREE +# define ATOMIC_POINTER_LOCK_FREE __GCC_ATOMIC_POINTER_LOCK_FREE #endif #ifdef _LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS -template -struct __cxx_atomic_lock_impl { - - _LIBCUDACXX_INLINE_VISIBILITY - __cxx_atomic_lock_impl() noexcept - : __a_value(), __a_lock(0) {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit - __cxx_atomic_lock_impl(_Tp value) noexcept - : __a_value(value), __a_lock(0) {} +template +struct __cxx_atomic_lock_impl +{ + _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_lock_impl() noexcept + : __a_value() + , __a_lock(0) + {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_lock_impl(_Tp value) noexcept + : __a_value(value) + , __a_lock(0) + {} _Tp __a_value; mutable __cxx_atomic_base_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, _Sco> __a_lock; - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const volatile { - while(1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; + _LIBCUDACXX_INLINE_VISIBILITY void __lock() const volatile + { + while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) + /*spin*/; } - _LIBCUDACXX_INLINE_VISIBILITY void __lock() const { - while(1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; + _LIBCUDACXX_INLINE_VISIBILITY void __lock() const + { + while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) + /*spin*/; } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const volatile { + _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const volatile + { __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); } - _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const { + _LIBCUDACXX_INLINE_VISIBILITY void __unlock() const + { __cxx_atomic_store(&__a_lock, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), memory_order_release); } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const volatile { + _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const volatile + { __lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a_value); __unlock(); return __old; } - _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const { + _LIBCUDACXX_INLINE_VISIBILITY _Tp __read() const + { __lock(); _Tp __old = __a_value; __unlock(); @@ -826,45 +838,47 @@ struct __cxx_atomic_lock_impl { }; template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) { +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) +{ __cxx_atomic_assign_volatile(__a->__a_value, __val); } template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) { +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val) +{ __a->__a_value = __val; } template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) +{ __a->__lock(); __cxx_atomic_assign_volatile(__a->__a_value, __val); __a->__unlock(); } 
template -_LIBCUDACXX_INLINE_VISIBILITY -void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __val, memory_order) +{ __a->__lock(); __a->__a_value = __val; __a->__unlock(); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) +{ return __a->__read(); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp, _Sco>* __a, memory_order) +{ return __a->__read(); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -873,77 +887,94 @@ _Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp _ return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __value, memory_order) +{ __a->__lock(); - _Tp __old = __a->__a_value; + _Tp __old = __a->__a_value; __a->__a_value = __value; __a->__unlock(); return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( + volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) +{ __a->__lock(); _Tp __temp; __cxx_atomic_assign_volatile(__temp, __a->__a_value); bool __ret = __temp == *__expected; - if(__ret) + if (__ret) + { __cxx_atomic_assign_volatile(__a->__a_value, __value); + } else + { __cxx_atomic_assign_volatile(*__expected, __a->__a_value); + } __a->__unlock(); return __ret; } template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_strong(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_strong( + __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) +{ __a->__lock(); bool __ret = __a->__a_value == *__expected; - if(__ret) + if (__ret) + { __a->__a_value = __value; + } else + { *__expected = __a->__a_value; + } __a->__unlock(); return __ret; } template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( + volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) +{ __a->__lock(); _Tp __temp; __cxx_atomic_assign_volatile(__temp, __a->__a_value); bool __ret = __temp == *__expected; - if(__ret) + if (__ret) + { __cxx_atomic_assign_volatile(__a->__a_value, 
__value); + } else + { __cxx_atomic_assign_volatile(*__expected, __a->__a_value); + } __a->__unlock(); return __ret; } template -_LIBCUDACXX_INLINE_VISIBILITY -bool __cxx_atomic_compare_exchange_weak(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp* __expected, _Tp __value, memory_order, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY bool __cxx_atomic_compare_exchange_weak( + __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) +{ __a->__lock(); bool __ret = __a->__a_value == *__expected; - if(__ret) + if (__ret) + { __a->__a_value = __value; + } else + { *__expected = __a->__a_value; + } __a->__unlock(); return __ret; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -952,9 +983,9 @@ _Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value += __delta; @@ -963,9 +994,9 @@ _Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, - ptrdiff_t __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, ptrdiff_t __delta, memory_order) +{ __a->__lock(); _Tp* __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -974,9 +1005,9 @@ _Tp* __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, - ptrdiff_t __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +__cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, ptrdiff_t __delta, memory_order) +{ __a->__lock(); _Tp* __old = __a->__a_value; __a->__a_value += __delta; @@ -985,9 +1016,9 @@ _Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -996,9 +1027,9 @@ _Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Td __delta, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Td __delta, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value -= __delta; @@ -1007,9 +1038,9 @@ _Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp 
__pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -1018,9 +1049,9 @@ _Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value &= __pattern; @@ -1029,9 +1060,9 @@ _Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -1040,9 +1071,9 @@ _Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value |= __pattern; @@ -1051,9 +1082,9 @@ _Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old; __cxx_atomic_assign_volatile(__old, __a->__a_value); @@ -1062,9 +1093,9 @@ _Tp __cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, - _Tp __pattern, memory_order) { +_LIBCUDACXX_INLINE_VISIBILITY _Tp +__cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, _Tp __pattern, memory_order) +{ __a->__lock(); _Tp __old = __a->__a_value; __a->__a_value ^= __pattern; @@ -1072,44 +1103,56 @@ _Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp, _Sco>* __a, return __old; } -#if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) +# if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) -template struct __cxx_is_always_lock_free { - enum { __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) }; }; +template +struct __cxx_is_always_lock_free +{ + enum + { + __value = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0) + }; +}; -#else +# else -template struct __cxx_is_always_lock_free { - enum { __value = sizeof(_Tp) <= 8 }; }; +template +struct __cxx_is_always_lock_free +{ + enum + { + __value = sizeof(_Tp) <= 8 + }; +}; -#endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) +# endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) template -struct __cxx_atomic_impl_conditional { - using type = __conditional_t<__cxx_is_always_lock_free<_Tp>::__value, - __cxx_atomic_base_impl<_Tp, _Sco>, - __cxx_atomic_lock_impl<_Tp, _Sco> >; +struct __cxx_atomic_impl_conditional +{ + using 
type = __conditional_t<__cxx_is_always_lock_free<_Tp>::__value, + __cxx_atomic_base_impl<_Tp, _Sco>, + __cxx_atomic_lock_impl<_Tp, _Sco>>; }; -template ::type > +template ::type> #else -template > +template > #endif //_LIBCUDACXX_ATOMIC_ONLY_USE_BUILTINS -struct __cxx_atomic_impl : public _Base { +struct __cxx_atomic_impl : public _Base +{ __cxx_atomic_impl() noexcept = default; _LIBCUDACXX_INLINE_VISIBILITY constexpr explicit __cxx_atomic_impl(_Tp value) noexcept - : _Base(value) {} + : _Base(value) + {} }; - -template -_LIBCUDACXX_INLINE_VISIBILITY -__cxx_atomic_impl<_Tp, _Sco>* __cxx_atomic_rebind(_Tp* __inst) { - static_assert(sizeof(__cxx_atomic_impl<_Tp, _Sco>) == sizeof(_Tp),""); - static_assert(alignof(__cxx_atomic_impl<_Tp, _Sco>) == alignof(_Tp),""); - return (__cxx_atomic_impl<_Tp, _Sco>*)__inst; +template +_LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_impl<_Tp, _Sco>* __cxx_atomic_rebind(_Tp* __inst) +{ + static_assert(sizeof(__cxx_atomic_impl<_Tp, _Sco>) == sizeof(_Tp), ""); + static_assert(alignof(__cxx_atomic_impl<_Tp, _Sco>) == alignof(_Tp), ""); + return (__cxx_atomic_impl<_Tp, _Sco>*) __inst; } template @@ -1118,25 +1161,29 @@ using __cxx_atomic_ref_impl = __cxx_atomic_ref_base_impl<_Tp, _Sco>; #ifdef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE template , int _Sco = _Ty::__sco> -struct __cxx_atomic_poll_tester { - _Ty const volatile* __a; - _Tp __val; - memory_order __order; +struct __cxx_atomic_poll_tester +{ + _Ty const volatile* __a; + _Tp __val; + memory_order __order; - _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_poll_tester(_Ty const volatile* __a_, _Tp __val_, memory_order __order_) + _LIBCUDACXX_INLINE_VISIBILITY __cxx_atomic_poll_tester(_Ty const volatile* __a_, _Tp __val_, memory_order __order_) : __a(__a_) , __val(__val_) , __order(__order_) - {} + {} - _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const { - return !(__cxx_atomic_load(__a, __order) == __val); - } + _LIBCUDACXX_INLINE_VISIBILITY bool operator()() const + { + return !(__cxx_atomic_load(__a, __order) == __val); + } }; template , int _Sco = _Ty::__sco> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow_fallback(_Ty const volatile* __a, _Tp __val, memory_order __order) { - __libcpp_thread_poll_with_backoff(__cxx_atomic_poll_tester<_Ty>(__a, __val, __order)); +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_try_wait_slow_fallback(_Ty const volatile* __a, _Tp __val, memory_order __order) +{ + __libcpp_thread_poll_with_backoff(__cxx_atomic_poll_tester<_Ty>(__a, __val, __order)); } #endif @@ -1144,632 +1191,888 @@ _LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow_fallback(_Ty const #ifdef _LIBCUDACXX_HAS_PLATFORM_WAIT template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), (__libcpp_platform_wait_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)0, memory_order_relaxed)) - __libcpp_platform_wake(&__c->__version, true); -#endif +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__version), 
(__libcpp_platform_wait_t) 1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 0, memory_order_relaxed)) + { + __libcpp_platform_wake(&__c->__version, true); + } +# endif } template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - __cxx_atomic_notify_all(__a); -} -template , int _Sco = _Ty::__sco, __enable_if_t::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); - if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - return; - if(sizeof(__libcpp_platform_wait_t) < 8) { - constexpr timespec __timeout = { 2, 0 }; // Hedge on rare 'int version' aliasing. - __libcpp_platform_wait(&__c->__version, __version, &__timeout); - } - else - __libcpp_platform_wait(&__c->__version, __version, nullptr); -#else - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); -#endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ + __cxx_atomic_notify_all(__a); +} +template , + int _Sco = _Ty::__sco, + __enable_if_t::__value, int> = 1> +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp const __val, memory_order __order) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + auto const __version = __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__version), memory_order_relaxed); + if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + { + return; + } + if (sizeof(__libcpp_platform_wait_t) < 8) + { + constexpr timespec __timeout = {2, 0}; // Hedge on rare 'int version' aliasing. 
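The store/fence/load pattern in this wait path, and the matching fetch_add/fence/exchange in the notify path above, form a Dekker-style handshake: each side publishes its flag with a relaxed operation, issues a seq_cst fence, then inspects the other side's flag, so at least one side observes the other and a wakeup cannot be lost. A hedged, self-contained sketch with plain std::atomic (illustrative names, not the library's internals):

#include <atomic>

std::atomic<int> waiters{0};   // set by a thread that is about to sleep
std::atomic<int> version{0};   // bumped by a thread that changed the value

void before_sleeping()
{
  waiters.store(1, std::memory_order_relaxed);
  std::atomic_thread_fence(std::memory_order_seq_cst);
  // re-check the watched value here; only then call the platform wait
}

void after_changing_value()
{
  version.fetch_add(1, std::memory_order_relaxed);
  std::atomic_thread_fence(std::memory_order_seq_cst);
  if (waiters.exchange(0, std::memory_order_relaxed) != 0)
  {
    // call the platform wake here
  }
}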
+ __libcpp_platform_wait(&__c->__version, __version, &__timeout); + } + else + { + __libcpp_platform_wait(&__c->__version, __version, nullptr); + } +# else + __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); +# endif // _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE } template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); -#endif - __libcpp_platform_wait((_Tp*)__a, __val, nullptr); -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t)1, memory_order_relaxed); -#endif +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp __val, memory_order) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_fetch_add(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); +# endif + __libcpp_platform_wait((_Tp*) __a, __val, nullptr); +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + __cxx_atomic_fetch_sub(__cxx_atomic_rebind<_Sco>(&__c->__waiters), (ptrdiff_t) 1, memory_order_relaxed); +# endif } template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -#endif - __libcpp_platform_wake((_Tp*)__a, true); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) +# endif + __libcpp_platform_wake((_Tp*) __a, true); } template ::__value, int> = 1> -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { -#ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE - auto * const __c = __libcpp_contention_state(__a); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) -#endif - __libcpp_platform_wake((_Tp*)__a, false); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ +# ifndef _LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 != __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__waiters), memory_order_relaxed)) +# endif + __libcpp_platform_wake((_Tp*) __a, false); } #elif !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - auto * const __c = __libcpp_contention_state(__a); - 
__cxx_atomic_thread_fence(memory_order_seq_cst); - if(0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) - return; - if(0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)0, memory_order_relaxed)) { - __libcpp_mutex_lock(&__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); - __libcpp_condvar_broadcast(&__c->__condvar); - } +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ + auto* const __c = __libcpp_contention_state(__a); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (0 == __cxx_atomic_load(__cxx_atomic_rebind<_Sco>(&__c->__credit), memory_order_relaxed)) + { + return; + } + if (0 != __cxx_atomic_exchange(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t) 0, memory_order_relaxed)) + { + __libcpp_mutex_lock(&__c->__mutex); + __libcpp_mutex_unlock(&__c->__mutex); + __libcpp_condvar_broadcast(&__c->__condvar); + } } template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) { - __cxx_atomic_notify_all(__a); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a) +{ + __cxx_atomic_notify_all(__a); } template -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) { - auto * const __c = __libcpp_contention_state(__a); - __libcpp_mutex_lock(&__c->__mutex); - __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t)1, memory_order_relaxed); - __cxx_atomic_thread_fence(memory_order_seq_cst); - if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); - __libcpp_mutex_unlock(&__c->__mutex); +_LIBCUDACXX_INLINE_VISIBILITY void +__cxx_atomic_try_wait_slow(__cxx_atomic_impl<_Tp, _Sco> const volatile* __a, _Tp const __val, memory_order __order) +{ + auto* const __c = __libcpp_contention_state(__a); + __libcpp_mutex_lock(&__c->__mutex); + __cxx_atomic_store(__cxx_atomic_rebind<_Sco>(&__c->__credit), (ptrdiff_t) 1, memory_order_relaxed); + __cxx_atomic_thread_fence(memory_order_seq_cst); + if (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + { + __libcpp_condvar_wait(&__c->__condvar, &__c->__mutex); + } + __libcpp_mutex_unlock(&__c->__mutex); } #else -template +template struct __atomic_wait_and_notify_supported -#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 +# if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 700 : false_type -#else +# else : true_type -#endif +# endif {}; template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp __val, memory_order __order) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic wait operations are unsupported on Pascal"); - __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_try_wait_slow(_Ty const volatile* __a, _Tp __val, memory_order __order) +{ + static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic wait operations are unsupported on Pascal"); + __cxx_atomic_try_wait_slow_fallback(__a, __val, __order); } template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(_Ty const volatile*) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic notify-one operations are unsupported on Pascal"); 
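The mutex/condvar fallback above avoids lost wakeups by having the waiter re-check the value while holding the mutex and the notifier touch the same mutex before broadcasting. A simplified standalone sketch of that discipline with std::mutex and std::condition_variable (illustrative names, not the library's internal API):

#include <atomic>
#include <condition_variable>
#include <mutex>

std::mutex              mtx;
std::condition_variable cv;
std::atomic<int>        value{0};

// Waiter: blocks until the value is observed to differ from `old`.
void wait_until_changed(int old)
{
  std::unique_lock<std::mutex> lock(mtx);
  cv.wait(lock, [&] { return value.load(std::memory_order_relaxed) != old; });
}

// Notifier: changes the value under the mutex so in-flight waiters are ordered,
// then broadcasts.
void change_and_notify(int next)
{
  {
    std::lock_guard<std::mutex> lock(mtx);
    value.store(next, std::memory_order_relaxed);
  }
  cv.notify_all();
}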
+_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_one(_Ty const volatile*) +{ + static_assert(__atomic_wait_and_notify_supported<_Tp>::value, + "atomic notify-one operations are unsupported on Pascal"); } template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(_Ty const volatile*) { - static_assert(__atomic_wait_and_notify_supported<_Tp>::value, "atomic notify-all operations are unsupported on Pascal"); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_notify_all(_Ty const volatile*) +{ + static_assert(__atomic_wait_and_notify_supported<_Tp>::value, + "atomic notify-all operations are unsupported on Pascal"); } #endif // _LIBCUDACXX_HAS_PLATFORM_WAIT || !defined(_LIBCUDACXX_HAS_NO_THREAD_CONTENTION_TABLE) template > -_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_wait(_Ty const volatile* __a, _Tp const __val, memory_order __order) { - for(int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) { - if(!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - return; - if(__i < 12) - __libcpp_thread_yield_processor(); - else - __libcpp_thread_yield(); +_LIBCUDACXX_INLINE_VISIBILITY void __cxx_atomic_wait(_Ty const volatile* __a, _Tp const __val, memory_order __order) +{ + for (int __i = 0; __i < _LIBCUDACXX_POLLING_COUNT; ++__i) + { + if (!__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + { + return; + } + if (__i < 12) + { + __libcpp_thread_yield_processor(); } - while(__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) - __cxx_atomic_try_wait_slow(__a, __val, __order); + else + { + __libcpp_thread_yield(); + } + } + while (__cxx_nonatomic_compare_equal(__cxx_atomic_load(__a, __order), __val)) + { + __cxx_atomic_try_wait_slow(__a, __val, __order); + } } template -struct __atomic_base_storage { - mutable _Storage __a_; +struct __atomic_base_storage +{ + mutable _Storage __a_; - __atomic_base_storage() = default; - __atomic_base_storage(const __atomic_base_storage&) = default; - __atomic_base_storage(__atomic_base_storage&&) = default; + __atomic_base_storage() = default; + __atomic_base_storage(const __atomic_base_storage&) = default; + __atomic_base_storage(__atomic_base_storage&&) = default; - __atomic_base_storage& operator=(const __atomic_base_storage&) = default; - __atomic_base_storage& operator=(__atomic_base_storage&&) = default; + __atomic_base_storage& operator=(const __atomic_base_storage&) = default; + __atomic_base_storage& operator=(__atomic_base_storage&&) = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_storage(_Storage&& __a) noexcept : __a_(_CUDA_VSTD::forward<_Storage>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_storage(_Storage&& __a) noexcept + : __a_(_CUDA_VSTD::forward<_Storage>(__a)) + {} }; template -struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage>{ - __atomic_base_core() = default; - __atomic_base_core(const __atomic_base_core&) = delete; - __atomic_base_core(__atomic_base_core&&) = delete; +struct __atomic_base_core : public __atomic_base_storage<_Tp, _Storage> +{ + __atomic_base_core() = default; + __atomic_base_core(const __atomic_base_core&) = delete; + __atomic_base_core(__atomic_base_core&&) = delete; - __atomic_base_core& operator=(const __atomic_base_core&) = delete; - __atomic_base_core& operator=(__atomic_base_core&&) = delete; + __atomic_base_core& operator=(const __atomic_base_core&) = delete; + __atomic_base_core& operator=(__atomic_base_core&&) = delete; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - 
__atomic_base_core(_Storage&& __a) noexcept : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_core(_Storage&& __a) noexcept + : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} #if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); #endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - _LIBCUDACXX_INLINE_VISIBILITY - bool is_lock_free() const volatile noexcept - {return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp));} - _LIBCUDACXX_INLINE_VISIBILITY - bool is_lock_free() const noexcept - {return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free();} - _LIBCUDACXX_INLINE_VISIBILITY - - void store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void store(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp load(memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - operator _Tp() const volatile noexcept {return load();} - _LIBCUDACXX_INLINE_VISIBILITY - operator _Tp() const noexcept {return load();} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) volatile noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == 
__m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const volatile noexcept + { + return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp)); + } + _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const noexcept + { + return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free(); + } + _LIBCUDACXX_INLINE_VISIBILITY + + void + store(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) + { + __cxx_atomic_store(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept + _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) + { + __cxx_atomic_store(&this->__a_, __d, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) + { + return __cxx_atomic_load(&this->__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) + { + return __cxx_atomic_load(&this->__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept + { + return load(); + } + _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept + { + return load(); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_exchange(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_exchange(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) volatile noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + } + else + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept + 
{ + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) noexcept { - if(memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if(memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) volatile noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + else + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) volatile noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + } + else + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); } + else + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + } + } - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept - {__cxx_atomic_notify_one(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept - {__cxx_atomic_notify_one(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept - {__cxx_atomic_notify_all(&this->__a_);} - 
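The single-order compare_exchange overloads above derive the failure order from the success order so that it is always a valid failure ordering: acq_rel falls back to acquire, release falls back to relaxed, and any other order is reused unchanged. The same mapping as a small standalone helper (illustrative only, not part of the library's interface):

#include <atomic>

std::memory_order failure_order_for(std::memory_order success)
{
  if (success == std::memory_order_acq_rel)
  {
    return std::memory_order_acquire;
  }
  if (success == std::memory_order_release)
  {
    return std::memory_order_relaxed;
  }
  return success;   // relaxed, consume, acquire and seq_cst are valid failure orders as-is
}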
_LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept - {__cxx_atomic_notify_all(&this->__a_);} + _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + __cxx_atomic_wait(&this->__a_, __v, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept + { + __cxx_atomic_wait(&this->__a_, __v, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept + { + __cxx_atomic_notify_one(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept + { + __cxx_atomic_notify_one(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept + { + __cxx_atomic_notify_all(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept + { + __cxx_atomic_notify_all(&this->__a_); + } }; template -struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_Tp, _Storage>{ - __atomic_base_core() = default; - __atomic_base_core(const __atomic_base_core&) = default; - __atomic_base_core(__atomic_base_core&&) = default; +struct __atomic_base_core<_Tp, true, _Storage> : public __atomic_base_storage<_Tp, _Storage> +{ + __atomic_base_core() = default; + __atomic_base_core(const __atomic_base_core&) = default; + __atomic_base_core(__atomic_base_core&&) = default; - __atomic_base_core& operator=(const __atomic_base_core&) = default; - __atomic_base_core& operator=(__atomic_base_core&&) = default; + __atomic_base_core& operator=(const __atomic_base_core&) = default; + __atomic_base_core& operator=(__atomic_base_core&&) = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_core(_Storage&& __a) noexcept : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_core(_Storage&& __a) noexcept + : __atomic_base_storage<_Tp, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} #if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); + static constexpr bool is_always_lock_free = _LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE(sizeof(_Tp), 0); #endif // defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) - _LIBCUDACXX_INLINE_VISIBILITY - bool is_lock_free() const volatile noexcept - {return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp));} - _LIBCUDACXX_INLINE_VISIBILITY - bool is_lock_free() const noexcept - {return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free();} - _LIBCUDACXX_INLINE_VISIBILITY - - void store(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void store(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) - {__cxx_atomic_store(&this->__a_, __d, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp load(memory_order __m = memory_order_seq_cst) const noexcept - _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) - {return __cxx_atomic_load(&this->__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - operator _Tp() const volatile noexcept {return load();} - _LIBCUDACXX_INLINE_VISIBILITY - operator _Tp() const noexcept {return load();} - 
_LIBCUDACXX_INLINE_VISIBILITY - _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_exchange(&this->__a_, __d, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) const volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) const noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) const volatile noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __s, memory_order __f) const noexcept - _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) - {return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f);} - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) const volatile noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const volatile noexcept + { + return _LIBCUDACXX_ATOMIC_IS_LOCK_FREE(sizeof(_Tp)); + } + _LIBCUDACXX_INLINE_VISIBILITY bool is_lock_free() const noexcept + { + return static_cast<__atomic_base_core const volatile*>(this)->is_lock_free(); + } + _LIBCUDACXX_INLINE_VISIBILITY + + void + store(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept + _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) + { + __cxx_atomic_store(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void store(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept + _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) + { + __cxx_atomic_store(&this->__a_, __d, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const volatile noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) + { + return __cxx_atomic_load(&this->__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp load(memory_order __m = memory_order_seq_cst) const noexcept + _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) + { + return __cxx_atomic_load(&this->__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const volatile noexcept + { + return load(); + } + _LIBCUDACXX_INLINE_VISIBILITY operator _Tp() const noexcept + { + return load(); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_exchange(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp exchange(_Tp __d, memory_order __m = memory_order_seq_cst) const noexcept + { + return 
__cxx_atomic_exchange(&this->__a_, __d, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const + volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const + volatile noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __s, memory_order __f) const noexcept + _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __s, __f); + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); + } + else + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_weak(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_weak(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) const noexcept { - if(memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if(memory_order_release == __m) - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + else + { + return __cxx_atomic_compare_exchange_weak(&this->__a_, &__e, __d, __m, __m); + } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) const volatile noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + else if (memory_order_release == __m) + { + return 
__cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); } - _LIBCUDACXX_INLINE_VISIBILITY - bool compare_exchange_strong(_Tp& __e, _Tp __d, - memory_order __m = memory_order_seq_cst) const noexcept { - if (memory_order_acq_rel == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); - else if (memory_order_release == __m) - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); - else - return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + else + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); } + } + _LIBCUDACXX_INLINE_VISIBILITY bool + compare_exchange_strong(_Tp& __e, _Tp __d, memory_order __m = memory_order_seq_cst) const noexcept + { + if (memory_order_acq_rel == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_acquire); + } + else if (memory_order_release == __m) + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, memory_order_relaxed); + } + else + { + return __cxx_atomic_compare_exchange_strong(&this->__a_, &__e, __d, __m, __m); + } + } - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} - _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&this->__a_, __v, __m);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const volatile noexcept - {__cxx_atomic_notify_one(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const noexcept - {__cxx_atomic_notify_one(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const volatile noexcept - {__cxx_atomic_notify_all(&this->__a_);} - _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const noexcept - {__cxx_atomic_notify_all(&this->__a_);} + _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + __cxx_atomic_wait(&this->__a_, __v, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void wait(_Tp __v, memory_order __m = memory_order_seq_cst) const noexcept + { + __cxx_atomic_wait(&this->__a_, __v, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const volatile noexcept + { + __cxx_atomic_notify_one(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() const noexcept + { + __cxx_atomic_notify_one(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const volatile noexcept + { + __cxx_atomic_notify_all(&this->__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() const noexcept + { + __cxx_atomic_notify_all(&this->__a_); + } }; template -struct __atomic_base_arithmetic : public __atomic_base_core<_Tp, _Cq, _Storage> { - __atomic_base_arithmetic() = default; - __atomic_base_arithmetic(const __atomic_base_arithmetic&) = delete; - __atomic_base_arithmetic(__atomic_base_arithmetic&&) = delete; - - __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = delete; - __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_arithmetic(_Storage&& __a) noexcept : __atomic_base_core<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, 
__m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++(int) volatile noexcept {return fetch_add(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++(int) noexcept {return fetch_add(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--(int) volatile noexcept {return fetch_sub(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--(int) noexcept {return fetch_sub(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++() volatile noexcept {return fetch_add(_Tp(1)) + _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++() noexcept {return fetch_add(_Tp(1)) + _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--() volatile noexcept {return fetch_sub(_Tp(1)) - _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--() noexcept {return fetch_sub(_Tp(1)) - _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator+=(_Tp __op) volatile noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator+=(_Tp __op) noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator-=(_Tp __op) volatile noexcept {return fetch_sub(__op) - __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator-=(_Tp __op) noexcept {return fetch_sub(__op) - __op;} +struct __atomic_base_arithmetic : public __atomic_base_core<_Tp, _Cq, _Storage> +{ + __atomic_base_arithmetic() = default; + __atomic_base_arithmetic(const __atomic_base_arithmetic&) = delete; + __atomic_base_arithmetic(__atomic_base_arithmetic&&) = delete; + + __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = delete; + __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = delete; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_arithmetic(_Storage&& __a) noexcept + : __atomic_base_core<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) volatile noexcept + { + return fetch_add(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) noexcept + { + return fetch_add(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) volatile noexcept + { + return fetch_sub(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) noexcept + { + return fetch_sub(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() volatile noexcept + { + return fetch_add(_Tp(1)) + _Tp(1); + } + 
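// Editor's note (not part of this patch): the single-memory_order compare_exchange_{weak,strong}
// overloads above derive the failure ordering from the success ordering (acq_rel -> acquire,
// release -> relaxed), and the increment/decrement operators forward to fetch_add/fetch_sub.
// A minimal host-side usage sketch under those assumptions; try_claim and bump are illustrative
// names, not part of this header.
//
//   #include <cuda/std/atomic>
//
//   bool try_claim(cuda::std::atomic<int>& counter, int expected)
//   {
//     // Failure ordering is derived internally: acq_rel on success implies acquire on failure.
//     return counter.compare_exchange_weak(expected, expected + 1, cuda::std::memory_order_acq_rel);
//   }
//
//   int bump(cuda::std::atomic<int>& counter)
//   {
//     return ++counter; // forwards to fetch_add(_Tp(1)) and returns the incremented value
//   }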
_LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() noexcept + { + return fetch_add(_Tp(1)) + _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() volatile noexcept + { + return fetch_sub(_Tp(1)) - _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() noexcept + { + return fetch_sub(_Tp(1)) - _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) volatile noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) volatile noexcept + { + return fetch_sub(__op) - __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) noexcept + { + return fetch_sub(__op) - __op; + } }; template -struct __atomic_base_arithmetic<_Tp, true, _Storage> : public __atomic_base_core<_Tp, true, _Storage> { - __atomic_base_arithmetic() = default; - __atomic_base_arithmetic(const __atomic_base_arithmetic&) = default; - __atomic_base_arithmetic(__atomic_base_arithmetic&&) = default; - - __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = default; - __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_arithmetic(_Storage&& __a) noexcept : __atomic_base_core<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++(int) const volatile noexcept {return fetch_add(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++(int) const noexcept {return fetch_add(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--(int) const volatile noexcept {return fetch_sub(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--(int) const noexcept {return fetch_sub(_Tp(1));} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++() const volatile noexcept {return fetch_add(_Tp(1)) + _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator++() const noexcept {return fetch_add(_Tp(1)) + _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--() const volatile noexcept {return fetch_sub(_Tp(1)) - _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator--() const noexcept {return fetch_sub(_Tp(1)) - _Tp(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator+=(_Tp __op) const volatile noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator+=(_Tp __op) const noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator-=(_Tp __op) const volatile noexcept {return fetch_sub(__op) - __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator-=(_Tp __op) const noexcept {return fetch_sub(__op) - __op;} +struct __atomic_base_arithmetic<_Tp, true, _Storage> : public __atomic_base_core<_Tp, true, _Storage> +{ + __atomic_base_arithmetic() = default; + 
__atomic_base_arithmetic(const __atomic_base_arithmetic&) = default; + __atomic_base_arithmetic(__atomic_base_arithmetic&&) = default; + + __atomic_base_arithmetic& operator=(const __atomic_base_arithmetic&) = default; + __atomic_base_arithmetic& operator=(__atomic_base_arithmetic&&) = default; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_arithmetic(_Storage&& __a) noexcept + : __atomic_base_core<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_add(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_sub(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) const volatile noexcept + { + return fetch_add(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++(int) const noexcept + { + return fetch_add(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) const volatile noexcept + { + return fetch_sub(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--(int) const noexcept + { + return fetch_sub(_Tp(1)); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() const volatile noexcept + { + return fetch_add(_Tp(1)) + _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator++() const noexcept + { + return fetch_add(_Tp(1)) + _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() const volatile noexcept + { + return fetch_sub(_Tp(1)) - _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator--() const noexcept + { + return fetch_sub(_Tp(1)) - _Tp(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) const volatile noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator+=(_Tp __op) const noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) const volatile noexcept + { + return fetch_sub(__op) - __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator-=(_Tp __op) const noexcept + { + return fetch_sub(__op) - __op; + } }; template -struct __atomic_base_bitwise : public __atomic_base_arithmetic<_Tp, _Cq, _Storage> { - __atomic_base_bitwise() = default; - __atomic_base_bitwise(const __atomic_base_bitwise&) = delete; - __atomic_base_bitwise(__atomic_base_bitwise&&) = delete; - - __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = delete; - __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = delete; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_bitwise(_Storage&& __a) noexcept : __atomic_base_arithmetic<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_or(_Tp __op, 
memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator&=(_Tp __op) volatile noexcept {return fetch_and(__op) & __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator&=(_Tp __op) noexcept {return fetch_and(__op) & __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator|=(_Tp __op) volatile noexcept {return fetch_or(__op) | __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator|=(_Tp __op) noexcept {return fetch_or(__op) | __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator^=(_Tp __op) volatile noexcept {return fetch_xor(__op) ^ __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator^=(_Tp __op) noexcept {return fetch_xor(__op) ^ __op;} +struct __atomic_base_bitwise : public __atomic_base_arithmetic<_Tp, _Cq, _Storage> +{ + __atomic_base_bitwise() = default; + __atomic_base_bitwise(const __atomic_base_bitwise&) = delete; + __atomic_base_bitwise(__atomic_base_bitwise&&) = delete; + + __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = delete; + __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = delete; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_bitwise(_Storage&& __a) noexcept + : __atomic_base_arithmetic<_Tp, _Cq, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_and(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_and(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_or(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_or(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) volatile noexcept + { + return fetch_and(__op) & __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) noexcept + { + return fetch_and(__op) & __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) volatile noexcept + { + return fetch_or(__op) | __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) noexcept + { + return fetch_or(__op) | __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) volatile noexcept + { + return fetch_xor(__op) ^ __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) noexcept + { + return fetch_xor(__op) ^ __op; + } 
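// Editor's note (not part of this patch): in this bitwise base, fetch_and/fetch_or/fetch_xor return
// the prior value, while operator&=, operator|= and operator^= return the updated value
// (e.g. fetch_or(__op) | __op). A small host-side sketch; set_bits and toggle_bits are illustrative
// names, not part of this header.
//
//   #include <cuda/std/atomic>
//
//   unsigned set_bits(cuda::std::atomic<unsigned>& flags, unsigned mask)
//   {
//     return flags |= mask; // returns old | mask, i.e. the updated value
//   }
//
//   unsigned toggle_bits(cuda::std::atomic<unsigned>& flags, unsigned mask)
//   {
//     return flags.fetch_xor(mask, cuda::std::memory_order_relaxed); // returns the prior value
//   }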
}; template -struct __atomic_base_bitwise<_Tp, true, _Storage> : public __atomic_base_arithmetic<_Tp, true, _Storage> { - __atomic_base_bitwise() = default; - __atomic_base_bitwise(const __atomic_base_bitwise&) = default; - __atomic_base_bitwise(__atomic_base_bitwise&&) = default; - - __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = default; - __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = default; - - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_bitwise(_Storage&& __a) noexcept : __atomic_base_arithmetic<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_and(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_or(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept - {return __cxx_atomic_fetch_xor(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator&=(_Tp __op) const volatile noexcept {return fetch_and(__op) & __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator&=(_Tp __op) const noexcept {return fetch_and(__op) & __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator|=(_Tp __op) const volatile noexcept {return fetch_or(__op) | __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator|=(_Tp __op) const noexcept {return fetch_or(__op) | __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator^=(_Tp __op) const volatile noexcept {return fetch_xor(__op) ^ __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator^=(_Tp __op) const noexcept {return fetch_xor(__op) ^ __op;} +struct __atomic_base_bitwise<_Tp, true, _Storage> : public __atomic_base_arithmetic<_Tp, true, _Storage> +{ + __atomic_base_bitwise() = default; + __atomic_base_bitwise(const __atomic_base_bitwise&) = default; + __atomic_base_bitwise(__atomic_base_bitwise&&) = default; + + __atomic_base_bitwise& operator=(const __atomic_base_bitwise&) = default; + __atomic_base_bitwise& operator=(__atomic_base_bitwise&&) = default; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_bitwise(_Storage&& __a) noexcept + : __atomic_base_arithmetic<_Tp, true, _Storage>(_CUDA_VSTD::forward<_Storage>(__a)) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_and(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_and(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_and(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_or(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_or(_Tp __op, 
memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_or(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp fetch_xor(_Tp __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_xor(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) const volatile noexcept + { + return fetch_and(__op) & __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator&=(_Tp __op) const noexcept + { + return fetch_and(__op) & __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) const volatile noexcept + { + return fetch_or(__op) | __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator|=(_Tp __op) const noexcept + { + return fetch_or(__op) | __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) const volatile noexcept + { + return fetch_xor(__op) ^ __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator^=(_Tp __op) const noexcept + { + return fetch_xor(__op) ^ __op; + } }; template -using __atomic_select_base = __conditional_t::value, - __atomic_base_arithmetic<_Tp, _Cq, _Storage>, - __conditional_t::value, - __atomic_base_bitwise<_Tp, _Cq, _Storage>, - __atomic_base_core<_Tp, _Cq, _Storage> >>; +using __atomic_select_base = + __conditional_t::value, + __atomic_base_arithmetic<_Tp, _Cq, _Storage>, + __conditional_t::value, + __atomic_base_bitwise<_Tp, _Cq, _Storage>, + __atomic_base_core<_Tp, _Cq, _Storage>>>; template >> -struct __atomic_base : public _Base { - __atomic_base() = default; - __atomic_base(const __atomic_base&) = delete; - __atomic_base(__atomic_base&&) = delete; +struct __atomic_base : public _Base +{ + __atomic_base() = default; + __atomic_base(const __atomic_base&) = delete; + __atomic_base(__atomic_base&&) = delete; - __atomic_base& operator=(const __atomic_base&) = delete; - __atomic_base& operator=(__atomic_base&&) = delete; + __atomic_base& operator=(const __atomic_base&) = delete; + __atomic_base& operator=(__atomic_base&&) = delete; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base(const _Tp& __a) noexcept : - _Base(__cxx_atomic_impl<_Tp, _Sco>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base(const _Tp& __a) noexcept + : _Base(__cxx_atomic_impl<_Tp, _Sco>(__a)) + {} }; template >> -struct __atomic_base_ref : public _Base { - __atomic_base_ref() = default; - __atomic_base_ref(const __atomic_base_ref&) = default; - __atomic_base_ref(__atomic_base_ref&&) = default; +struct __atomic_base_ref : public _Base +{ + __atomic_base_ref() = default; + __atomic_base_ref(const __atomic_base_ref&) = default; + __atomic_base_ref(__atomic_base_ref&&) = default; - __atomic_base_ref& operator=(const __atomic_base_ref&) = default; - __atomic_base_ref& operator=(__atomic_base_ref&&) = default; + __atomic_base_ref& operator=(const __atomic_base_ref&) = default; + __atomic_base_ref& operator=(__atomic_base_ref&&) = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - __atomic_base_ref(_Tp& __a) noexcept : - _Base(__cxx_atomic_ref_impl<_Tp, _Sco>(__a)) {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr __atomic_base_ref(_Tp& __a) noexcept + : _Base(__cxx_atomic_ref_impl<_Tp, _Sco>(__a)) + {} }; #if defined(_LIBCUDACXX_ATOMIC_ALWAYS_LOCK_FREE) @@ -1779,1059 +2082,918 @@ constexpr bool __atomic_base_core<_Tp, _Cq, _Storage>::is_always_lock_free; // atomic template -struct 
atomic - : public __atomic_base<_Tp> +struct atomic : public __atomic_base<_Tp> { - typedef __atomic_base<_Tp> __base; - using value_type = _Tp; + typedef __atomic_base<_Tp> __base; + using value_type = _Tp; - atomic() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY - constexpr atomic(_Tp __d) noexcept : __base(__d) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator=(_Tp __d) volatile noexcept - {__base::store(__d); return __d;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator=(_Tp __d) noexcept - {__base::store(__d); return __d;} + atomic() noexcept = default; + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic(_Tp __d) noexcept + : __base(__d) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) volatile noexcept + { + __base::store(__d); + return __d; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __d) noexcept + { + __base::store(__d); + return __d; + } }; // atomic template -struct atomic<_Tp*> - : public __atomic_base<_Tp*> +struct atomic<_Tp*> : public __atomic_base<_Tp*> { - typedef __atomic_base<_Tp*> __base; - using value_type = _Tp*; + typedef __atomic_base<_Tp*> __base; + using value_type = _Tp*; - atomic() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY - constexpr atomic(_Tp* __d) noexcept : __base(__d) {} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator=(_Tp* __d) volatile noexcept - {__base::store(__d); return __d;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator=(_Tp* __d) noexcept - {__base::store(__d); return __d;} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - volatile noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - volatile noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++(int) volatile noexcept {return fetch_add(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++(int) noexcept {return fetch_add(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--(int) volatile noexcept {return fetch_sub(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--(int) noexcept {return fetch_sub(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++() volatile noexcept {return fetch_add(1) + 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++() noexcept {return fetch_add(1) + 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--() volatile noexcept {return fetch_sub(1) - 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--() noexcept {return fetch_sub(1) - 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator+=(ptrdiff_t __op) volatile noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator+=(ptrdiff_t __op) noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator-=(ptrdiff_t __op) volatile noexcept {return fetch_sub(__op) - __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator-=(ptrdiff_t __op) noexcept {return fetch_sub(__op) - __op;} + atomic() noexcept = default; + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic(_Tp* __d) noexcept + : __base(__d) + {} + + _LIBCUDACXX_INLINE_VISIBILITY _Tp* 
operator=(_Tp* __d) volatile noexcept + { + __base::store(__d); + return __d; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator=(_Tp* __d) noexcept + { + __base::store(__d); + return __d; + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) volatile noexcept + { + return fetch_add(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) noexcept + { + return fetch_add(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) volatile noexcept + { + return fetch_sub(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) noexcept + { + return fetch_sub(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() volatile noexcept + { + return fetch_add(1) + 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() noexcept + { + return fetch_add(1) + 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--() volatile noexcept + { + return fetch_sub(1) - 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--() noexcept + { + return fetch_sub(1) - 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) volatile noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) volatile noexcept + { + return fetch_sub(__op) - __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) noexcept + { + return fetch_sub(__op) - __op; + } }; // atomic_ref template - struct atomic_ref - : public __atomic_base_ref<_Tp> +struct atomic_ref : public __atomic_base_ref<_Tp> { - typedef __atomic_base_ref<_Tp> __base; - using value_type = _Tp; + typedef __atomic_base_ref<_Tp> __base; + using value_type = _Tp; - static constexpr size_t required_alignment = sizeof(_Tp); + static constexpr size_t required_alignment = sizeof(_Tp); - static constexpr bool is_always_lock_free = sizeof(_Tp) <= 8; + static constexpr bool is_always_lock_free = sizeof(_Tp) <= 8; - _LIBCUDACXX_INLINE_VISIBILITY - explicit atomic_ref(_Tp& __ref) : __base(__ref) {} + _LIBCUDACXX_INLINE_VISIBILITY explicit atomic_ref(_Tp& __ref) + : __base(__ref) + {} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp operator=(_Tp __v) const volatile noexcept {__base::store(__v); return __v;} + _LIBCUDACXX_INLINE_VISIBILITY _Tp operator=(_Tp __v) const volatile noexcept + { + __base::store(__v); + return __v; + } }; // atomic_ref template - struct atomic_ref<_Tp*> - : public __atomic_base_ref<_Tp*> +struct atomic_ref<_Tp*> : public __atomic_base_ref<_Tp*> { - typedef __atomic_base_ref<_Tp*> __base; - using value_type = _Tp*; + typedef __atomic_base_ref<_Tp*> __base; + using value_type = _Tp*; - static constexpr size_t required_alignment = sizeof(_Tp*); + static constexpr size_t required_alignment = sizeof(_Tp*); - static constexpr bool 
is_always_lock_free = sizeof(_Tp*) <= 8; + static constexpr bool is_always_lock_free = sizeof(_Tp*) <= 8; - _LIBCUDACXX_INLINE_VISIBILITY - explicit atomic_ref(_Tp*& __ref) : __base(__ref) {} + _LIBCUDACXX_INLINE_VISIBILITY explicit atomic_ref(_Tp*& __ref) + : __base(__ref) + {} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator=(_Tp* __v) const noexcept {__base::store(__v); return __v;} + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator=(_Tp* __v) const noexcept + { + __base::store(__v); + return __v; + } - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - const noexcept - {return __cxx_atomic_fetch_add(&this->__a_, __op, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) - const noexcept - {return __cxx_atomic_fetch_sub(&this->__a_, __op, __m);} + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_add(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_add(&this->__a_, __op, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* fetch_sub(ptrdiff_t __op, memory_order __m = memory_order_seq_cst) const noexcept + { + return __cxx_atomic_fetch_sub(&this->__a_, __op, __m); + } - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++(int) const noexcept {return fetch_add(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--(int) const noexcept {return fetch_sub(1);} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator++() const noexcept {return fetch_add(1) + 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator--() const noexcept {return fetch_sub(1) - 1;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator+=(ptrdiff_t __op) const noexcept {return fetch_add(__op) + __op;} - _LIBCUDACXX_INLINE_VISIBILITY - _Tp* operator-=(ptrdiff_t __op) const noexcept {return fetch_sub(__op) - __op;} + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++(int) const noexcept + { + return fetch_add(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--(int) const noexcept + { + return fetch_sub(1); + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator++() const noexcept + { + return fetch_add(1) + 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator--() const noexcept + { + return fetch_sub(1) - 1; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator+=(ptrdiff_t __op) const noexcept + { + return fetch_add(__op) + __op; + } + _LIBCUDACXX_INLINE_VISIBILITY _Tp* operator-=(ptrdiff_t __op) const noexcept + { + return fetch_sub(__op) - __op; + } }; // atomic_is_lock_free template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_is_lock_free(const volatile atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_is_lock_free(const volatile atomic<_Tp>* __o) noexcept { - return __o->is_lock_free(); + return __o->is_lock_free(); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_is_lock_free(const atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_is_lock_free(const atomic<_Tp>* __o) noexcept { - return __o->is_lock_free(); + return __o->is_lock_free(); } // atomic_init template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_init(volatile atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_init(volatile atomic<_Tp>* __o, _Tp __d) noexcept { - __cxx_atomic_init(&__o->__a_, __d); + __cxx_atomic_init(&__o->__a_, __d); } template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_init(atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_init(atomic<_Tp>* __o, _Tp __d) noexcept { - __cxx_atomic_init(&__o->__a_, __d); + 
__cxx_atomic_init(&__o->__a_, __d); } // atomic_store template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_store(volatile atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store(volatile atomic<_Tp>* __o, _Tp __d) noexcept { - __o->store(__d); + __o->store(__d); } template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_store(atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store(atomic<_Tp>* __o, _Tp __d) noexcept { - __o->store(__d); + __o->store(__d); } // atomic_store_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_store_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) { - __o->store(__d, __m); + __o->store(__d, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -void -atomic_store_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_store_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept _LIBCUDACXX_CHECK_STORE_MEMORY_ORDER(__m) { - __o->store(__d, __m); + __o->store(__d, __m); } // atomic_load template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_load(const volatile atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load(const volatile atomic<_Tp>* __o) noexcept { - return __o->load(); + return __o->load(); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_load(const atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load(const atomic<_Tp>* __o) noexcept { - return __o->load(); + return __o->load(); } // atomic_load_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_load_explicit(const volatile atomic<_Tp>* __o, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load_explicit(const volatile atomic<_Tp>* __o, memory_order __m) noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) { - return __o->load(__m); + return __o->load(__m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_load_explicit(const atomic<_Tp>* __o, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_load_explicit(const atomic<_Tp>* __o, memory_order __m) noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) { - return __o->load(__m); + return __o->load(__m); } // atomic_exchange template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_exchange(volatile atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange(volatile atomic<_Tp>* __o, _Tp __d) noexcept { - return __o->exchange(__d); + return __o->exchange(__d); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_exchange(atomic<_Tp>* __o, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange(atomic<_Tp>* __o, _Tp __d) noexcept { - return __o->exchange(__d); + return __o->exchange(__d); } // atomic_exchange_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_exchange_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange_explicit(volatile atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept { - return __o->exchange(__d, __m); + return __o->exchange(__d, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp -atomic_exchange_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp atomic_exchange_explicit(atomic<_Tp>* __o, _Tp __d, memory_order __m) noexcept { - return __o->exchange(__d, __m); + return 
__o->exchange(__d, __m); } // atomic_compare_exchange_weak template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept { - return __o->compare_exchange_weak(*__e, __d); + return __o->compare_exchange_weak(*__e, __d); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_weak(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept { - return __o->compare_exchange_weak(*__e, __d); + return __o->compare_exchange_weak(*__e, __d); } // atomic_compare_exchange_strong template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong(volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept { - return __o->compare_exchange_strong(*__e, __d); + return __o->compare_exchange_strong(*__e, __d); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_strong(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong(atomic<_Tp>* __o, _Tp* __e, _Tp __d) noexcept { - return __o->compare_exchange_strong(*__e, __d); + return __o->compare_exchange_strong(*__e, __d); } // atomic_compare_exchange_weak_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_weak_explicit(volatile atomic<_Tp>* __o, _Tp* __e, - _Tp __d, - memory_order __s, memory_order __f) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_weak_explicit( + volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { - return __o->compare_exchange_weak(*__e, __d, __s, __f); + return __o->compare_exchange_weak(*__e, __d, __s, __f); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_weak_explicit(atomic<_Tp>* __o, _Tp* __e, _Tp __d, - memory_order __s, memory_order __f) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool +atomic_compare_exchange_weak_explicit(atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { - return __o->compare_exchange_weak(*__e, __d, __s, __f); + return __o->compare_exchange_weak(*__e, __d, __s, __f); } // atomic_compare_exchange_strong_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_strong_explicit(volatile atomic<_Tp>* __o, - _Tp* __e, _Tp __d, - memory_order __s, memory_order __f) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit( + volatile atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { - return __o->compare_exchange_strong(*__e, __d, __s, __f); + return __o->compare_exchange_strong(*__e, __d, __s, __f); } template -_LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_compare_exchange_strong_explicit(atomic<_Tp>* __o, _Tp* __e, - _Tp __d, - memory_order __s, memory_order __f) noexcept +_LIBCUDACXX_INLINE_VISIBILITY bool atomic_compare_exchange_strong_explicit( + atomic<_Tp>* __o, _Tp* __e, _Tp __d, memory_order __s, memory_order __f) noexcept _LIBCUDACXX_CHECK_EXCHANGE_MEMORY_ORDER(__s, __f) { - return __o->compare_exchange_strong(*__e, __d, __s, __f); + 
return __o->compare_exchange_strong(*__e, __d, __s, __f); } // atomic_wait template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_wait(const volatile atomic<_Tp>* __o, - typename atomic<_Tp>::value_type __v) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait(const volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v) noexcept { - return __o->wait(__v); + return __o->wait(__v); } template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_wait(const atomic<_Tp>* __o, - typename atomic<_Tp>::value_type __v) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_wait(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v) noexcept { - return __o->wait(__v); + return __o->wait(__v); } // atomic_wait_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_wait_explicit(const volatile atomic<_Tp>* __o, - typename atomic<_Tp>::value_type __v, - memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait_explicit(const volatile atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v, memory_order __m) noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) { - return __o->wait(__v, __m); + return __o->wait(__v, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_wait_explicit(const atomic<_Tp>* __o, - typename atomic<_Tp>::value_type __v, - memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void +atomic_wait_explicit(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __v, memory_order __m) noexcept _LIBCUDACXX_CHECK_LOAD_MEMORY_ORDER(__m) { - return __o->wait(__v, __m); + return __o->wait(__v, __m); } // atomic_notify_one template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_notify_one(volatile atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_one(volatile atomic<_Tp>* __o) noexcept { - __o->notify_one(); + __o->notify_one(); } template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_notify_one(atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_one(atomic<_Tp>* __o) noexcept { - __o->notify_one(); + __o->notify_one(); } // atomic_notify_one template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_notify_all(volatile atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_all(volatile atomic<_Tp>* __o) noexcept { - __o->notify_all(); + __o->notify_all(); } template -_LIBCUDACXX_INLINE_VISIBILITY -void atomic_notify_all(atomic<_Tp>* __o) noexcept +_LIBCUDACXX_INLINE_VISIBILITY void atomic_notify_all(atomic<_Tp>* __o) noexcept { - __o->notify_all(); + __o->notify_all(); } // atomic_fetch_add template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_add(volatile atomic<_Tp>* __o, _Tp __op) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_add(__op); + return __o->fetch_add(__op); } template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_add(atomic<_Tp>* __o, _Tp __op) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_add(__op); + return __o->fetch_add(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_add(volatile atomic<_Tp*>* __o, ptrdiff_t 
__op) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept { - return __o->fetch_add(__op); + return __o->fetch_add(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_add(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept { - return __o->fetch_add(__op); + return __o->fetch_add(__op); } // atomic_fetch_add_explicit template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_add_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_add(__op, __m); + return __o->fetch_add(__op, __m); } template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_add_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_add_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_add(__op, __m); + return __o->fetch_add(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, - memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +atomic_fetch_add_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept { - return __o->fetch_add(__op, __m); + return __o->fetch_add(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_add_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept { - return __o->fetch_add(__op, __m); + return __o->fetch_add(__op, __m); } // atomic_fetch_sub template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_sub(volatile atomic<_Tp>* __o, _Tp __op) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_sub(__op); + return __o->fetch_sub(__op); } template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_sub(atomic<_Tp>* __o, _Tp __op) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_sub(__op); + return __o->fetch_sub(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_sub(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub(volatile atomic<_Tp*>* __o, ptrdiff_t __op) noexcept { - return __o->fetch_sub(__op); + return __o->fetch_sub(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_sub(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub(atomic<_Tp*>* __o, ptrdiff_t __op) noexcept { - 
return __o->fetch_sub(__op); + return __o->fetch_sub(__op); } // atomic_fetch_sub_explicit template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_sub_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_sub(__op, __m); + return __o->fetch_sub(__op, __m); } template _LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - (is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, - _Tp -> -atomic_fetch_sub_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept + __enable_if_t<(is_integral<_Tp>::value && !is_same<_Tp, bool>::value) || is_floating_point<_Tp>::value, _Tp> + atomic_fetch_sub_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_sub(__op, __m); + return __o->fetch_sub(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* -atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, - memory_order __m) noexcept +_LIBCUDACXX_INLINE_VISIBILITY _Tp* +atomic_fetch_sub_explicit(volatile atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept { - return __o->fetch_sub(__op, __m); + return __o->fetch_sub(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -_Tp* +_LIBCUDACXX_INLINE_VISIBILITY _Tp* atomic_fetch_sub_explicit(atomic<_Tp*>* __o, ptrdiff_t __op, memory_order __m) noexcept { - return __o->fetch_sub(__op, __m); + return __o->fetch_sub(__op, __m); } // atomic_fetch_and template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_and(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_and(__op); + return __o->fetch_and(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_and(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_and(__op); + return __o->fetch_and(__op); } // atomic_fetch_and_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_and_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_and(__op, __m); + return __o->fetch_and(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_and_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_and(__op, __m); + return __o->fetch_and(__op, __m); } // atomic_fetch_or template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_or(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_or(__op); + return __o->fetch_or(__op); 
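// Editor's note (not part of this patch): the atomic_fetch_* free functions above mirror the member
// API and are constrained via __enable_if_t to integral, non-bool element types (with separate
// pointer overloads taking ptrdiff_t). A minimal sketch of the C-compatible interface; or_in is an
// illustrative name, not part of this header.
//
//   #include <cuda/std/atomic>
//
//   unsigned or_in(cuda::std::atomic<unsigned>* flags, unsigned mask)
//   {
//     // Equivalent to flags->fetch_or(mask); ill-formed for atomic<bool> or atomic<float>.
//     return cuda::std::atomic_fetch_or(flags, mask);
//   }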
} template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_or(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_or(__op); + return __o->fetch_or(__op); } // atomic_fetch_or_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_or_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_or(__op, __m); + return __o->fetch_or(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_or_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_or(__op, __m); + return __o->fetch_or(__op, __m); } // atomic_fetch_xor template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_xor(volatile atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_xor(__op); + return __o->fetch_xor(__op); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_xor(atomic<_Tp>* __o, _Tp __op) noexcept { - return __o->fetch_xor(__op); + return __o->fetch_xor(__op); } // atomic_fetch_xor_explicit template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_xor_explicit(volatile atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_xor(__op, __m); + return __o->fetch_xor(__op, __m); } template -_LIBCUDACXX_INLINE_VISIBILITY -__enable_if_t -< - is_integral<_Tp>::value && !is_same<_Tp, bool>::value, - _Tp -> +_LIBCUDACXX_INLINE_VISIBILITY __enable_if_t::value && !is_same<_Tp, bool>::value, _Tp> atomic_fetch_xor_explicit(atomic<_Tp>* __o, _Tp __op, memory_order __m) noexcept { - return __o->fetch_xor(__op, __m); + return __o->fetch_xor(__op, __m); } // flag type and operations typedef struct atomic_flag { - __cxx_atomic_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, 0> __a_; - - _LIBCUDACXX_INLINE_VISIBILITY - bool test(memory_order __m = memory_order_seq_cst) const volatile noexcept - {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__cxx_atomic_load(&__a_, __m);} - _LIBCUDACXX_INLINE_VISIBILITY - bool test(memory_order __m = memory_order_seq_cst) const noexcept - {return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true)==__cxx_atomic_load(&__a_, __m);} - - _LIBCUDACXX_INLINE_VISIBILITY - bool test_and_set(memory_order __m = memory_order_seq_cst) volatile noexcept - {return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - bool test_and_set(memory_order __m = memory_order_seq_cst) noexcept - {return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void clear(memory_order __m = memory_order_seq_cst) volatile noexcept - 
{__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void clear(memory_order __m = memory_order_seq_cst) noexcept - {__cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m);} + __cxx_atomic_impl<_LIBCUDACXX_ATOMIC_FLAG_TYPE, 0> __a_; + + _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const volatile noexcept + { + return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true) == __cxx_atomic_load(&__a_, __m); + } + _LIBCUDACXX_INLINE_VISIBILITY bool test(memory_order __m = memory_order_seq_cst) const noexcept + { + return _LIBCUDACXX_ATOMIC_FLAG_TYPE(true) == __cxx_atomic_load(&__a_, __m); + } + + _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) volatile noexcept + { + return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY bool test_and_set(memory_order __m = memory_order_seq_cst) noexcept + { + return __cxx_atomic_exchange(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(true), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) volatile noexcept + { + __cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void clear(memory_order __m = memory_order_seq_cst) noexcept + { + __cxx_atomic_store(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(false), __m); + } #if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 - _LIBCUDACXX_INLINE_VISIBILITY - void wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile noexcept - {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void wait(bool __v, memory_order __m = memory_order_seq_cst) const noexcept - {__cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m);} - _LIBCUDACXX_INLINE_VISIBILITY - void notify_one() volatile noexcept - {__cxx_atomic_notify_one(&__a_);} - _LIBCUDACXX_INLINE_VISIBILITY - void notify_one() noexcept - {__cxx_atomic_notify_one(&__a_);} - _LIBCUDACXX_INLINE_VISIBILITY - void notify_all() volatile noexcept - {__cxx_atomic_notify_all(&__a_);} - _LIBCUDACXX_INLINE_VISIBILITY - void notify_all() noexcept - {__cxx_atomic_notify_all(&__a_);} + _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile noexcept + { + __cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void wait(bool __v, memory_order __m = memory_order_seq_cst) const noexcept + { + __cxx_atomic_wait(&__a_, _LIBCUDACXX_ATOMIC_FLAG_TYPE(__v), __m); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() volatile noexcept + { + __cxx_atomic_notify_one(&__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_one() noexcept + { + __cxx_atomic_notify_one(&__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() volatile noexcept + { + __cxx_atomic_notify_all(&__a_); + } + _LIBCUDACXX_INLINE_VISIBILITY void notify_all() noexcept + { + __cxx_atomic_notify_all(&__a_); + } #endif - atomic_flag() noexcept = default; + atomic_flag() noexcept = default; - _LIBCUDACXX_INLINE_VISIBILITY constexpr - atomic_flag(bool __b) noexcept : __a_(__b) {} // EXTENSION + _LIBCUDACXX_INLINE_VISIBILITY constexpr atomic_flag(bool __b) noexcept + : __a_(__b) + {} // EXTENSION - atomic_flag(const atomic_flag&) = delete; - atomic_flag& operator=(const atomic_flag&) = delete; - atomic_flag& operator=(const atomic_flag&) volatile = delete; + atomic_flag(const atomic_flag&) = delete; + 
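// Editor's note (not part of this patch): atomic_flag exposes test/test_and_set/clear, plus
// wait/notify_one/notify_all when __CUDA_MINIMUM_ARCH__ >= 700, and an EXTENSION constructor from
// bool (marked above). A minimal spinlock sketch using that API; spinlock is an illustrative name,
// not part of this header.
//
//   #include <cuda/std/atomic>
//
//   struct spinlock
//   {
//     cuda::std::atomic_flag flag{false}; // uses the bool constructor marked EXTENSION above
//
//     void lock()
//     {
//       // Spin until the previously stored value was false, i.e. this thread set the flag.
//       while (flag.test_and_set(cuda::std::memory_order_acquire)) {}
//     }
//     void unlock()
//     {
//       flag.clear(cuda::std::memory_order_release);
//     }
//   };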
atomic_flag& operator=(const atomic_flag&) = delete; + atomic_flag& operator=(const atomic_flag&) volatile = delete; } atomic_flag; - -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test(const volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test(const volatile atomic_flag* __o) noexcept { - return __o->test(); + return __o->test(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test(const atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test(const atomic_flag* __o) noexcept { - return __o->test(); + return __o->test(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_explicit(const volatile atomic_flag* __o, memory_order __m) noexcept { - return __o->test(__m); + return __o->test(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test_explicit(const atomic_flag* __o, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_explicit(const atomic_flag* __o, memory_order __m) noexcept { - return __o->test(__m); + return __o->test(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test_and_set(volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set(volatile atomic_flag* __o) noexcept { - return __o->test_and_set(); + return __o->test_and_set(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test_and_set(atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set(atomic_flag* __o) noexcept { - return __o->test_and_set(); + return __o->test_and_set(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set_explicit(volatile atomic_flag* __o, memory_order __m) noexcept { - return __o->test_and_set(__m); + return __o->test_and_set(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -bool -atomic_flag_test_and_set_explicit(atomic_flag* __o, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bool atomic_flag_test_and_set_explicit(atomic_flag* __o, memory_order __m) noexcept { - return __o->test_and_set(__m); + return __o->test_and_set(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_clear(volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear(volatile atomic_flag* __o) noexcept { - __o->clear(); + __o->clear(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_clear(atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear(atomic_flag* __o) noexcept { - __o->clear(); + __o->clear(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear_explicit(volatile atomic_flag* __o, memory_order __m) noexcept { - __o->clear(__m); + __o->clear(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_clear_explicit(atomic_flag* __o, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_clear_explicit(atomic_flag* __o, memory_order __m) noexcept { - __o->clear(__m); + __o->clear(__m); } #if !defined(__CUDA_MINIMUM_ARCH__) || __CUDA_MINIMUM_ARCH__ >= 700 -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_wait(const volatile atomic_flag* __o, bool __v) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_wait(const volatile atomic_flag* __o, bool __v) noexcept { - __o->wait(__v); + __o->wait(__v); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void 
-atomic_flag_wait(const atomic_flag* __o, bool __v) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_wait(const atomic_flag* __o, bool __v) noexcept { - __o->wait(__v); + __o->wait(__v); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_wait_explicit(const volatile atomic_flag* __o, - bool __v, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void +atomic_flag_wait_explicit(const volatile atomic_flag* __o, bool __v, memory_order __m) noexcept { - __o->wait(__v, __m); + __o->wait(__v, __m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_wait_explicit(const atomic_flag* __o, - bool __v, memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void +atomic_flag_wait_explicit(const atomic_flag* __o, bool __v, memory_order __m) noexcept { - __o->wait(__v, __m); + __o->wait(__v, __m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_notify_one(volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_one(volatile atomic_flag* __o) noexcept { - __o->notify_one(); + __o->notify_one(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_notify_one(atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_one(atomic_flag* __o) noexcept { - __o->notify_one(); + __o->notify_one(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_notify_all(volatile atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_all(volatile atomic_flag* __o) noexcept { - __o->notify_all(); + __o->notify_all(); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_flag_notify_all(atomic_flag* __o) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_flag_notify_all(atomic_flag* __o) noexcept { - __o->notify_all(); + __o->notify_all(); } #endif // fences -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_thread_fence(memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_thread_fence(memory_order __m) noexcept { - __cxx_atomic_thread_fence(__m); + __cxx_atomic_thread_fence(__m); } -inline _LIBCUDACXX_INLINE_VISIBILITY -void -atomic_signal_fence(memory_order __m) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY void atomic_signal_fence(memory_order __m) noexcept { - __cxx_atomic_signal_fence(__m); + __cxx_atomic_signal_fence(__m); } // Atomics for standard typedef types -typedef atomic atomic_bool; -typedef atomic atomic_char; -typedef atomic atomic_schar; -typedef atomic atomic_uchar; -typedef atomic atomic_short; -typedef atomic atomic_ushort; -typedef atomic atomic_int; -typedef atomic atomic_uint; -typedef atomic atomic_long; -typedef atomic atomic_ulong; -typedef atomic atomic_llong; +typedef atomic atomic_bool; +typedef atomic atomic_char; +typedef atomic atomic_schar; +typedef atomic atomic_uchar; +typedef atomic atomic_short; +typedef atomic atomic_ushort; +typedef atomic atomic_int; +typedef atomic atomic_uint; +typedef atomic atomic_long; +typedef atomic atomic_ulong; +typedef atomic atomic_llong; typedef atomic atomic_ullong; -typedef atomic atomic_char16_t; -typedef atomic atomic_char32_t; -typedef atomic atomic_wchar_t; +typedef atomic atomic_char16_t; +typedef atomic atomic_char32_t; +typedef atomic atomic_wchar_t; -typedef atomic atomic_int_least8_t; -typedef atomic atomic_uint_least8_t; -typedef atomic atomic_int_least16_t; +typedef atomic atomic_int_least8_t; +typedef atomic atomic_uint_least8_t; +typedef atomic atomic_int_least16_t; typedef atomic atomic_uint_least16_t; -typedef atomic 
atomic_int_least32_t; +typedef atomic atomic_int_least32_t; typedef atomic atomic_uint_least32_t; -typedef atomic atomic_int_least64_t; +typedef atomic atomic_int_least64_t; typedef atomic atomic_uint_least64_t; -typedef atomic atomic_int_fast8_t; -typedef atomic atomic_uint_fast8_t; -typedef atomic atomic_int_fast16_t; +typedef atomic atomic_int_fast8_t; +typedef atomic atomic_uint_fast8_t; +typedef atomic atomic_int_fast16_t; typedef atomic atomic_uint_fast16_t; -typedef atomic atomic_int_fast32_t; +typedef atomic atomic_int_fast32_t; typedef atomic atomic_uint_fast32_t; -typedef atomic atomic_int_fast64_t; +typedef atomic atomic_int_fast64_t; typedef atomic atomic_uint_fast64_t; -typedef atomic< int8_t> atomic_int8_t; -typedef atomic atomic_uint8_t; -typedef atomic< int16_t> atomic_int16_t; +typedef atomic atomic_int8_t; +typedef atomic atomic_uint8_t; +typedef atomic atomic_int16_t; typedef atomic atomic_uint16_t; -typedef atomic< int32_t> atomic_int32_t; +typedef atomic atomic_int32_t; typedef atomic atomic_uint32_t; -typedef atomic< int64_t> atomic_int64_t; +typedef atomic atomic_int64_t; typedef atomic atomic_uint64_t; -typedef atomic atomic_intptr_t; +typedef atomic atomic_intptr_t; typedef atomic atomic_uintptr_t; -typedef atomic atomic_size_t; +typedef atomic atomic_size_t; typedef atomic atomic_ptrdiff_t; -typedef atomic atomic_intmax_t; +typedef atomic atomic_intmax_t; typedef atomic atomic_uintmax_t; static_assert(ATOMIC_INT_LOCK_FREE, "This library assumes atomic is lock-free."); -typedef atomic atomic_signed_lock_free; -typedef atomic atomic_unsigned_lock_free; +typedef atomic atomic_signed_lock_free; +typedef atomic atomic_unsigned_lock_free; -#define ATOMIC_FLAG_INIT {false} -#define ATOMIC_VAR_INIT(__v) {__v} +#define ATOMIC_FLAG_INIT \ + { \ + false \ + } +#define ATOMIC_VAR_INIT(__v) \ + { \ + __v \ + } _LIBCUDACXX_END_NAMESPACE_STD #include #include -#endif // _LIBCUDACXX_ATOMIC +#endif // _LIBCUDACXX_ATOMIC diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/bitset b/libcudacxx/include/cuda/std/detail/libcxx/include/bitset index c475bfb7d9f..ebf17ae02a2 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/bitset +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/bitset @@ -74,14 +74,10 @@ public: template basic_string > to_string(charT zero = charT('0'), charT one = charT('1')) const; template - basic_string, allocator > to_string(charT zero = charT('0'), charT one = charT('1')) const; - basic_string, allocator > to_string(char zero = '0', char one = '1') const; - size_t count() const noexcept; - constexpr size_t size() const noexcept; - bool operator==(const bitset& rhs) const noexcept; - bool operator!=(const bitset& rhs) const noexcept; - bool test(size_t pos) const; - bool all() const noexcept; + basic_string, allocator > to_string(charT zero = charT('0'), charT one = +charT('1')) const; basic_string, allocator > to_string(char zero = '0', char one = '1') +const; size_t count() const noexcept; constexpr size_t size() const noexcept; bool operator==(const bitset& rhs) const +noexcept; bool operator!=(const bitset& rhs) const noexcept; bool test(size_t pos) const; bool all() const noexcept; bool any() const noexcept; bool none() const noexcept; bitset operator<<(size_t pos) const noexcept; @@ -112,14 +108,14 @@ template struct hash>; */ -#include <__config> #include <__bit_reference> -#include +#include <__config> +#include <__functional_base> #include -#include -#include +#include #include -#include <__functional_base> +#include +#include 
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -132,929 +128,901 @@ template struct hash>; _LIBCUDACXX_PUSH_MACROS #include <__undef_macros> - _LIBCUDACXX_BEGIN_NAMESPACE_STD template class __bitset; template -struct __has_storage_type<__bitset<_N_words, _Size> > +struct __has_storage_type<__bitset<_N_words, _Size>> { - static const bool value = true; + static const bool value = true; }; template class __bitset { public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef size_type __storage_type; + protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - __storage_type __first_[_N_words]; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY - constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - explicit constexpr __bitset(unsigned long long __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept - {return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept - {return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept - {return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept - {return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word);} - - _LIBCUDACXX_INLINE_VISIBILITY - void operator&=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - void operator|=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - void operator^=(const __bitset& __v) noexcept; - - void flip() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const - {return to_ulong(integral_constant());} - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const - {return to_ullong(integral_constant());} - - bool all() const noexcept; - bool any() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - size_t __hash_code() const noexcept; + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + __storage_type __first_[_N_words]; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, 
true> const_iterator; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; + _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept + { + return reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept + { + return const_reference(__first_ + __pos / __bits_per_word, __storage_type(1) << __pos % __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept + { + return iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept + { + return const_iterator(__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + + _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset& __v) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset& __v) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset& __v) noexcept; + + void flip() noexcept; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const + { + return to_ulong(integral_constant < bool, _Size()); + } + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const + { + return to_ullong(integral_constant < bool, _Size()); + } + + bool all() const noexcept; + bool any() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept; + private: - unsigned long to_ulong(false_type) const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long to_ulong(true_type) const; - unsigned long long to_ullong(false_type) const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long long to_ullong(true_type) const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long long to_ullong(true_type, false_type) const; - unsigned long long to_ullong(true_type, true_type) const; + unsigned long to_ulong(false_type) const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong(true_type) const; + unsigned long long to_ullong(false_type) const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong(true_type) const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong(true_type, false_type) const; + unsigned long long to_ullong(true_type, true_type) const; }; template -inline constexpr -__bitset<_N_words, _Size>::__bitset() noexcept +inline constexpr __bitset<_N_words, _Size>::__bitset() noexcept : __first_{0} {} template -inline -constexpr -__bitset<_N_words, _Size>::__bitset(unsigned long long __v) noexcept +inline constexpr __bitset<_N_words, _Size>::__bitset(unsigned long long __v) noexcept #if __SIZEOF_SIZE_T__ == 8 : __first_{__v} #elif __SIZEOF_SIZE_T__ == 4 - : __first_{static_cast<__storage_type>(__v), - _Size >= 2 * __bits_per_word ? static_cast<__storage_type>(__v >> __bits_per_word) - : static_cast<__storage_type>((__v >> __bits_per_word) & (__storage_type(1) << (_Size - __bits_per_word)) - 1)} + : __first_{ + static_cast<__storage_type>(__v), + _Size >= 2 * __bits_per_word + ? 
static_cast<__storage_type>(__v >> __bits_per_word) + : static_cast<__storage_type>((__v >> __bits_per_word) & (__storage_type(1) << (_Size - __bits_per_word)) - 1)} #else -#error This constructor has not been ported to this platform +# error This constructor has not been ported to this platform #endif {} template -inline -void -__bitset<_N_words, _Size>::operator&=(const __bitset& __v) noexcept +inline void __bitset<_N_words, _Size>::operator&=(const __bitset& __v) noexcept { - for (size_type __i = 0; __i < _N_words; ++__i) - __first_[__i] &= __v.__first_[__i]; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] &= __v.__first_[__i]; + } } template -inline -void -__bitset<_N_words, _Size>::operator|=(const __bitset& __v) noexcept +inline void __bitset<_N_words, _Size>::operator|=(const __bitset& __v) noexcept { - for (size_type __i = 0; __i < _N_words; ++__i) - __first_[__i] |= __v.__first_[__i]; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] |= __v.__first_[__i]; + } } template -inline -void -__bitset<_N_words, _Size>::operator^=(const __bitset& __v) noexcept +inline void __bitset<_N_words, _Size>::operator^=(const __bitset& __v) noexcept { - for (size_type __i = 0; __i < _N_words; ++__i) - __first_[__i] ^= __v.__first_[__i]; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __first_[__i] ^= __v.__first_[__i]; + } } template -void -__bitset<_N_words, _Size>::flip() noexcept -{ - // do middle whole words - size_type __n = _Size; - __storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - *__p = ~*__p; - // do last partial word - if (__n > 0) - { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - __storage_type __b = *__p & __m; - *__p &= ~__m; - *__p |= ~__b & __m; - } +void __bitset<_N_words, _Size>::flip() noexcept +{ + // do middle whole words + size_type __n = _Size; + __storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + *__p = ~*__p; + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + __storage_type __b = *__p & __m; + *__p &= ~__m; + *__p |= ~__b & __m; + } } template -unsigned long -__bitset<_N_words, _Size>::to_ulong(false_type) const +unsigned long __bitset<_N_words, _Size>::to_ulong(false_type) const { - const_iterator __e = __make_iter(_Size); - const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); - if (__i != __e) - __throw_overflow_error("bitset to_ulong overflow error"); + const_iterator __e = __make_iter(_Size); + const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long) * CHAR_BIT), __e, true); + if (__i != __e) + { + __throw_overflow_error("bitset to_ulong overflow error"); + } - return __first_[0]; + return __first_[0]; } template -inline -unsigned long -__bitset<_N_words, _Size>::to_ulong(true_type) const +inline unsigned long __bitset<_N_words, _Size>::to_ulong(true_type) const { - return __first_[0]; + return __first_[0]; } template -unsigned long long -__bitset<_N_words, _Size>::to_ullong(false_type) const +unsigned long long __bitset<_N_words, _Size>::to_ullong(false_type) const { - const_iterator __e = __make_iter(_Size); - const_iterator __i = _CUDA_VSTD::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); - if (__i != __e) - __throw_overflow_error("bitset to_ullong overflow error"); + const_iterator __e = __make_iter(_Size); + const_iterator __i = 
_CUDA_VSTD::find(__make_iter(sizeof(unsigned long long) * CHAR_BIT), __e, true); + if (__i != __e) + { + __throw_overflow_error("bitset to_ullong overflow error"); + } - return to_ullong(true_type()); + return to_ullong(true_type()); } template -inline -unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type) const +inline unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type) const { - return to_ullong(true_type(), integral_constant()); + return to_ullong(true_type(), integral_constant()); } template -inline -unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type, false_type) const +inline unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, false_type) const { - return __first_[0]; + return __first_[0]; } template -unsigned long long -__bitset<_N_words, _Size>::to_ullong(true_type, true_type) const -{ - unsigned long long __r = __first_[0]; - for (std::size_t __i = 1; __i < sizeof(unsigned long long) / sizeof(__storage_type); ++__i) - __r |= static_cast(__first_[__i]) << (sizeof(__storage_type) * CHAR_BIT); - return __r; +unsigned long long __bitset<_N_words, _Size>::to_ullong(true_type, true_type) const +{ + unsigned long long __r = __first_[0]; + for (std::size_t __i = 1; __i < sizeof(unsigned long long) / sizeof(__storage_type); ++__i) + { + __r |= static_cast(__first_[__i]) << (sizeof(__storage_type) * CHAR_BIT); + } + return __r; } template -bool -__bitset<_N_words, _Size>::all() const noexcept -{ - // do middle whole words - size_type __n = _Size; - __const_storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - if (~*__p) - return false; - // do last partial word - if (__n > 0) +bool __bitset<_N_words, _Size>::all() const noexcept +{ + // do middle whole words + size_type __n = _Size; + __const_storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + if (~*__p) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (~*__p & __m) - return false; + return false; } - return true; + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (~*__p & __m) + { + return false; + } + } + return true; } template -bool -__bitset<_N_words, _Size>::any() const noexcept -{ - // do middle whole words - size_type __n = _Size; - __const_storage_pointer __p = __first_; - for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) - if (*__p) - return true; - // do last partial word - if (__n > 0) +bool __bitset<_N_words, _Size>::any() const noexcept +{ + // do middle whole words + size_type __n = _Size; + __const_storage_pointer __p = __first_; + for (; __n >= __bits_per_word; ++__p, __n -= __bits_per_word) + { + if (*__p) { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); - if (*__p & __m) - return true; + return true; } - return false; + } + // do last partial word + if (__n > 0) + { + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n); + if (*__p & __m) + { + return true; + } + } + return false; } template -inline -size_t -__bitset<_N_words, _Size>::__hash_code() const noexcept -{ - size_t __h = 0; - for (size_type __i = 0; __i < _N_words; ++__i) - __h ^= __first_[__i]; - return __h; +inline size_t __bitset<_N_words, _Size>::__hash_code() const noexcept +{ + size_t __h = 0; + for (size_type __i = 0; __i < _N_words; ++__i) + { + __h ^= __first_[__i]; + } + return __h; } template class __bitset<1, _Size> { public: - typedef ptrdiff_t 
difference_type; - typedef size_t size_type; - typedef size_type __storage_type; -protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - __storage_type __first_; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY - constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - explicit constexpr __bitset(unsigned long long __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept - {return reference(&__first_, __storage_type(1) << __pos);} - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept - {return const_reference(&__first_, __storage_type(1) << __pos);} - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t __pos) noexcept - {return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word);} - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept - {return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word);} - - _LIBCUDACXX_INLINE_VISIBILITY - void operator&=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - void operator|=(const __bitset& __v) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - void operator^=(const __bitset& __v) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY - void flip() noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long to_ulong() const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long long to_ullong() const; - - _LIBCUDACXX_INLINE_VISIBILITY - bool all() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bool any() const noexcept; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef size_type __storage_type; - _LIBCUDACXX_INLINE_VISIBILITY - size_t __hash_code() const noexcept; +protected: + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + __storage_type __first_; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; + _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long __v) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t __pos) noexcept + { + return reference(&__first_, __storage_type(1) << __pos); + } + _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t __pos) const noexcept + { + return const_reference(&__first_, __storage_type(1) << __pos); + } + _LIBCUDACXX_INLINE_VISIBILITY iterator 
__make_iter(size_t __pos) noexcept + { + return iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t __pos) const noexcept + { + return const_iterator(&__first_ + __pos / __bits_per_word, __pos % __bits_per_word); + } + + _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset& __v) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset& __v) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset& __v) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const; + + _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept; }; template -inline constexpr -__bitset<1, _Size>::__bitset() noexcept +inline constexpr __bitset<1, _Size>::__bitset() noexcept : __first_(0) -{ -} +{} template -inline constexpr -__bitset<1, _Size>::__bitset(unsigned long long __v) noexcept - : __first_( - _Size == __bits_per_word ? static_cast<__storage_type>(__v) - : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - 1) - ) -{ -} +inline constexpr __bitset<1, _Size>::__bitset(unsigned long long __v) noexcept + : __first_(_Size == __bits_per_word ? static_cast<__storage_type>(__v) + : static_cast<__storage_type>(__v) & ((__storage_type(1) << _Size) - 1)) +{} template -inline -void -__bitset<1, _Size>::operator&=(const __bitset& __v) noexcept +inline void __bitset<1, _Size>::operator&=(const __bitset& __v) noexcept { - __first_ &= __v.__first_; + __first_ &= __v.__first_; } template -inline -void -__bitset<1, _Size>::operator|=(const __bitset& __v) noexcept +inline void __bitset<1, _Size>::operator|=(const __bitset& __v) noexcept { - __first_ |= __v.__first_; + __first_ |= __v.__first_; } template -inline -void -__bitset<1, _Size>::operator^=(const __bitset& __v) noexcept +inline void __bitset<1, _Size>::operator^=(const __bitset& __v) noexcept { - __first_ ^= __v.__first_; + __first_ ^= __v.__first_; } template -inline -void -__bitset<1, _Size>::flip() noexcept +inline void __bitset<1, _Size>::flip() noexcept { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - __first_ = ~__first_; - __first_ &= __m; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + __first_ = ~__first_; + __first_ &= __m; } template -inline -unsigned long -__bitset<1, _Size>::to_ulong() const +inline unsigned long __bitset<1, _Size>::to_ulong() const { - return __first_; + return __first_; } template -inline -unsigned long long -__bitset<1, _Size>::to_ullong() const +inline unsigned long long __bitset<1, _Size>::to_ullong() const { - return __first_; + return __first_; } template -inline -bool -__bitset<1, _Size>::all() const noexcept +inline bool __bitset<1, _Size>::all() const noexcept { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - return !(~__first_ & __m); + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + return !(~__first_ & __m); } template -inline -bool -__bitset<1, _Size>::any() const noexcept +inline bool __bitset<1, _Size>::any() const noexcept { - __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); - return __first_ & __m; + __storage_type __m = ~__storage_type(0) >> (__bits_per_word - _Size); + return __first_ & 
__m; } template -inline -size_t -__bitset<1, _Size>::__hash_code() const noexcept +inline size_t __bitset<1, _Size>::__hash_code() const noexcept { - return __first_; + return __first_; } template <> class __bitset<0, 0> { public: - typedef ptrdiff_t difference_type; - typedef size_t size_type; - typedef size_type __storage_type; + typedef ptrdiff_t difference_type; + typedef size_t size_type; + typedef size_type __storage_type; + protected: - typedef __bitset __self; - typedef __storage_type* __storage_pointer; - typedef const __storage_type* __const_storage_pointer; - static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); - - friend class __bit_reference<__bitset>; - friend class __bit_const_reference<__bitset>; - friend class __bit_iterator<__bitset, false>; - friend class __bit_iterator<__bitset, true>; - friend struct __bit_array<__bitset>; - - typedef __bit_reference<__bitset> reference; - typedef __bit_const_reference<__bitset> const_reference; - typedef __bit_iterator<__bitset, false> iterator; - typedef __bit_iterator<__bitset, true> const_iterator; - - _LIBCUDACXX_INLINE_VISIBILITY - constexpr __bitset() noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - explicit constexpr __bitset(unsigned long long) noexcept; - - _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t) noexcept - {return reference(0, 1);} - _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t) const noexcept - {return const_reference(0, 1);} - _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t) noexcept - {return iterator(0, 0);} - _LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t) const noexcept - {return const_iterator(0, 0);} - - _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset&) noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset&) noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset&) noexcept {} - - _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept {} - - _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const {return 0;} - _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const {return 0;} - - _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept {return true;} - _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept {return false;} - - _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept {return 0;} + typedef __bitset __self; + typedef __storage_type* __storage_pointer; + typedef const __storage_type* __const_storage_pointer; + static const unsigned __bits_per_word = static_cast(sizeof(__storage_type) * CHAR_BIT); + + friend class __bit_reference<__bitset>; + friend class __bit_const_reference<__bitset>; + friend class __bit_iterator<__bitset, false>; + friend class __bit_iterator<__bitset, true>; + friend struct __bit_array<__bitset>; + + typedef __bit_reference<__bitset> reference; + typedef __bit_const_reference<__bitset> const_reference; + typedef __bit_iterator<__bitset, false> iterator; + typedef __bit_iterator<__bitset, true> const_iterator; + + _LIBCUDACXX_INLINE_VISIBILITY constexpr __bitset() noexcept; + _LIBCUDACXX_INLINE_VISIBILITY explicit constexpr __bitset(unsigned long long) noexcept; + + _LIBCUDACXX_INLINE_VISIBILITY reference __make_ref(size_t) noexcept + { + return reference(0, 1); + } + _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference __make_ref(size_t) const noexcept + { + return const_reference(0, 1); + } + _LIBCUDACXX_INLINE_VISIBILITY iterator __make_iter(size_t) noexcept + { + return iterator(0, 0); + } + 
_LIBCUDACXX_INLINE_VISIBILITY const_iterator __make_iter(size_t) const noexcept + { + return const_iterator(0, 0); + } + + _LIBCUDACXX_INLINE_VISIBILITY void operator&=(const __bitset&) noexcept {} + _LIBCUDACXX_INLINE_VISIBILITY void operator|=(const __bitset&) noexcept {} + _LIBCUDACXX_INLINE_VISIBILITY void operator^=(const __bitset&) noexcept {} + + _LIBCUDACXX_INLINE_VISIBILITY void flip() noexcept {} + + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const + { + return 0; + } + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const + { + return 0; + } + + _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept + { + return true; + } + _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept + { + return false; + } + + _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return 0; + } }; -inline -constexpr -__bitset<0, 0>::__bitset() noexcept -{ -} +inline constexpr __bitset<0, 0>::__bitset() noexcept {} -inline -constexpr -__bitset<0, 0>::__bitset(unsigned long long) noexcept -{ -} +inline constexpr __bitset<0, 0>::__bitset(unsigned long long) noexcept {} -template class _LIBCUDACXX_TEMPLATE_VIS bitset; -template struct hash >; +template +class _LIBCUDACXX_TEMPLATE_VIS bitset; +template +struct hash>; template class _LIBCUDACXX_TEMPLATE_VIS bitset : private __bitset<_Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1, _Size> { public: - static const unsigned __n_words = _Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1; - typedef __bitset<__n_words, _Size> base; + static const unsigned __n_words = _Size == 0 ? 0 : (_Size - 1) / (sizeof(size_t) * CHAR_BIT) + 1; + typedef __bitset<__n_words, _Size> base; public: - typedef typename base::reference reference; - typedef typename base::const_reference const_reference; - - // 23.3.5.1 constructors: - _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset() noexcept {} - _LIBCUDACXX_INLINE_VISIBILITY constexpr - bitset(unsigned long long __v) noexcept : base(__v) {} - template::value> > - explicit bitset(const _CharT* __str, - typename basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos, - _CharT __zero = _CharT('0'), _CharT __one = _CharT('1')); - template - explicit bitset(const basic_string<_CharT,_Traits,_Allocator>& __str, - typename basic_string<_CharT,_Traits,_Allocator>::size_type __pos = 0, - typename basic_string<_CharT,_Traits,_Allocator>::size_type __n = - (basic_string<_CharT,_Traits,_Allocator>::npos), - _CharT __zero = _CharT('0'), _CharT __one = _CharT('1')); - - // 23.3.5.2 bitset operations: - _LIBCUDACXX_INLINE_VISIBILITY - bitset& operator&=(const bitset& __rhs) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset& operator|=(const bitset& __rhs) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset& operator^=(const bitset& __rhs) noexcept; - bitset& operator<<=(size_t __pos) noexcept; - bitset& operator>>=(size_t __pos) noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset& set() noexcept; - bitset& set(size_t __pos, bool __val = true); - _LIBCUDACXX_INLINE_VISIBILITY - bitset& reset() noexcept; - bitset& reset(size_t __pos); - _LIBCUDACXX_INLINE_VISIBILITY - bitset operator~() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset& flip() noexcept; - bitset& flip(size_t __pos); - - // element access: - _LIBCUDACXX_INLINE_VISIBILITY constexpr - const_reference operator[](size_t __p) const {return base::__make_ref(__p);} - _LIBCUDACXX_INLINE_VISIBILITY reference operator[](size_t __p) {return base::__make_ref(__p);} - _LIBCUDACXX_INLINE_VISIBILITY - unsigned 
long to_ulong() const; - _LIBCUDACXX_INLINE_VISIBILITY - unsigned long long to_ullong() const; - template - basic_string<_CharT, _Traits, _Allocator> to_string(_CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')) const; - template - _LIBCUDACXX_INLINE_VISIBILITY - basic_string<_CharT, _Traits, allocator<_CharT> > to_string(_CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')) const; - template - _LIBCUDACXX_INLINE_VISIBILITY - basic_string<_CharT, char_traits<_CharT>, allocator<_CharT> > to_string(_CharT __zero = _CharT('0'), - _CharT __one = _CharT('1')) const; - _LIBCUDACXX_INLINE_VISIBILITY - basic_string, allocator > to_string(char __zero = '0', - char __one = '1') const; - _LIBCUDACXX_INLINE_VISIBILITY - size_t count() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY constexpr size_t size() const noexcept {return _Size;} - _LIBCUDACXX_INLINE_VISIBILITY - bool operator==(const bitset& __rhs) const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bool operator!=(const bitset& __rhs) const noexcept; - bool test(size_t __pos) const; - _LIBCUDACXX_INLINE_VISIBILITY - bool all() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bool any() const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY bool none() const noexcept {return !any();} - _LIBCUDACXX_INLINE_VISIBILITY - bitset operator<<(size_t __pos) const noexcept; - _LIBCUDACXX_INLINE_VISIBILITY - bitset operator>>(size_t __pos) const noexcept; + typedef typename base::reference reference; + typedef typename base::const_reference const_reference; + + // 23.3.5.1 constructors: + _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset() noexcept {} + _LIBCUDACXX_INLINE_VISIBILITY constexpr bitset(unsigned long long __v) noexcept + : base(__v) + {} + template ::value>> + explicit bitset(const _CharT* __str, + typename basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos, + _CharT __zero = _CharT('0'), + _CharT __one = _CharT('1')); + template + explicit bitset(const basic_string<_CharT, _Traits, _Allocator>& __str, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos = 0, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __n = + (basic_string<_CharT, _Traits, _Allocator>::npos), + _CharT __zero = _CharT('0'), + _CharT __one = _CharT('1')); + + // 23.3.5.2 bitset operations: + _LIBCUDACXX_INLINE_VISIBILITY bitset& operator&=(const bitset& __rhs) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset& operator|=(const bitset& __rhs) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset& operator^=(const bitset& __rhs) noexcept; + bitset& operator<<=(size_t __pos) noexcept; + bitset& operator>>=(size_t __pos) noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset& set() noexcept; + bitset& set(size_t __pos, bool __val = true); + _LIBCUDACXX_INLINE_VISIBILITY bitset& reset() noexcept; + bitset& reset(size_t __pos); + _LIBCUDACXX_INLINE_VISIBILITY bitset operator~() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset& flip() noexcept; + bitset& flip(size_t __pos); + + // element access: + _LIBCUDACXX_INLINE_VISIBILITY constexpr const_reference operator[](size_t __p) const + { + return base::__make_ref(__p); + } + _LIBCUDACXX_INLINE_VISIBILITY reference operator[](size_t __p) + { + return base::__make_ref(__p); + } + _LIBCUDACXX_INLINE_VISIBILITY unsigned long to_ulong() const; + _LIBCUDACXX_INLINE_VISIBILITY unsigned long long to_ullong() const; + template + basic_string<_CharT, _Traits, _Allocator> to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; + template + _LIBCUDACXX_INLINE_VISIBILITY 
basic_string<_CharT, _Traits, allocator<_CharT>> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; + template + _LIBCUDACXX_INLINE_VISIBILITY basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> + to_string(_CharT __zero = _CharT('0'), _CharT __one = _CharT('1')) const; + _LIBCUDACXX_INLINE_VISIBILITY basic_string, allocator> + to_string(char __zero = '0', char __one = '1') const; + _LIBCUDACXX_INLINE_VISIBILITY size_t count() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY constexpr size_t size() const noexcept + { + return _Size; + } + _LIBCUDACXX_INLINE_VISIBILITY bool operator==(const bitset& __rhs) const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bool operator!=(const bitset& __rhs) const noexcept; + bool test(size_t __pos) const; + _LIBCUDACXX_INLINE_VISIBILITY bool all() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bool any() const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bool none() const noexcept + { + return !any(); + } + _LIBCUDACXX_INLINE_VISIBILITY bitset operator<<(size_t __pos) const noexcept; + _LIBCUDACXX_INLINE_VISIBILITY bitset operator>>(size_t __pos) const noexcept; private: + _LIBCUDACXX_INLINE_VISIBILITY size_t __hash_code() const noexcept + { + return base::__hash_code(); + } - _LIBCUDACXX_INLINE_VISIBILITY - size_t __hash_code() const noexcept {return base::__hash_code();} - - friend struct hash; + friend struct hash; }; template -template -bitset<_Size>::bitset(const _CharT* __str, - typename basic_string<_CharT>::size_type __n, - _CharT __zero, _CharT __one) +template +bitset<_Size>::bitset(const _CharT* __str, typename basic_string<_CharT>::size_type __n, _CharT __zero, _CharT __one) { - size_t __rlen = _CUDA_VSTD::min(__n, char_traits<_CharT>::length(__str)); - for (size_t __i = 0; __i < __rlen; ++__i) - if (__str[__i] != __zero && __str[__i] != __one) - __throw_invalid_argument("bitset string ctor has invalid argument"); - - size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); - size_t __i = 0; - for (; __i < _Mp; ++__i) + size_t __rlen = _CUDA_VSTD::min(__n, char_traits<_CharT>::length(__str)); + for (size_t __i = 0; __i < __rlen; ++__i) + { + if (__str[__i] != __zero && __str[__i] != __one) { - _CharT __c = __str[_Mp - 1 - __i]; - if (__c == __zero) - (*this)[__i] = false; - else - (*this)[__i] = true; + __throw_invalid_argument("bitset string ctor has invalid argument"); } - _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + } + + size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); + size_t __i = 0; + for (; __i < _Mp; ++__i) + { + _CharT __c = __str[_Mp - 1 - __i]; + if (__c == __zero) + { + (*this)[__i] = false; + } + else + { + (*this)[__i] = true; + } + } + _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); } template -template -bitset<_Size>::bitset(const basic_string<_CharT,_Traits,_Allocator>& __str, - typename basic_string<_CharT,_Traits,_Allocator>::size_type __pos, - typename basic_string<_CharT,_Traits,_Allocator>::size_type __n, - _CharT __zero, _CharT __one) -{ - if (__pos > __str.size()) - __throw_out_of_range("bitset string pos out of range"); - - size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); - for (size_t __i = __pos; __i < __pos + __rlen; ++__i) - if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) - __throw_invalid_argument("bitset string ctor has invalid argument"); - - size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); - size_t __i = 0; - for (; __i < _Mp; ++__i) +template +bitset<_Size>::bitset( + const basic_string<_CharT, _Traits, 
_Allocator>& __str, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __pos, + typename basic_string<_CharT, _Traits, _Allocator>::size_type __n, + _CharT __zero, + _CharT __one) +{ + if (__pos > __str.size()) + { + __throw_out_of_range("bitset string pos out of range"); + } + + size_t __rlen = _CUDA_VSTD::min(__n, __str.size() - __pos); + for (size_t __i = __pos; __i < __pos + __rlen; ++__i) + { + if (!_Traits::eq(__str[__i], __zero) && !_Traits::eq(__str[__i], __one)) + { + __throw_invalid_argument("bitset string ctor has invalid argument"); + } + } + + size_t _Mp = _CUDA_VSTD::min(__rlen, _Size); + size_t __i = 0; + for (; __i < _Mp; ++__i) + { + _CharT __c = __str[__pos + _Mp - 1 - __i]; + if (_Traits::eq(__c, __zero)) { - _CharT __c = __str[__pos + _Mp - 1 - __i]; - if (_Traits::eq(__c, __zero)) - (*this)[__i] = false; - else - (*this)[__i] = true; + (*this)[__i] = false; } - _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); + else + { + (*this)[__i] = true; + } + } + _CUDA_VSTD::fill(base::__make_iter(__i), base::__make_iter(_Size), false); } template -inline -bitset<_Size>& -bitset<_Size>::operator&=(const bitset& __rhs) noexcept +inline bitset<_Size>& bitset<_Size>::operator&=(const bitset& __rhs) noexcept { - base::operator&=(__rhs); - return *this; + base::operator&=(__rhs); + return *this; } template -inline -bitset<_Size>& -bitset<_Size>::operator|=(const bitset& __rhs) noexcept +inline bitset<_Size>& bitset<_Size>::operator|=(const bitset& __rhs) noexcept { - base::operator|=(__rhs); - return *this; + base::operator|=(__rhs); + return *this; } template -inline -bitset<_Size>& -bitset<_Size>::operator^=(const bitset& __rhs) noexcept +inline bitset<_Size>& bitset<_Size>::operator^=(const bitset& __rhs) noexcept { - base::operator^=(__rhs); - return *this; + base::operator^=(__rhs); + return *this; } template -bitset<_Size>& -bitset<_Size>::operator<<=(size_t __pos) noexcept +bitset<_Size>& bitset<_Size>::operator<<=(size_t __pos) noexcept { - __pos = _CUDA_VSTD::min(__pos, _Size); - _CUDA_VSTD::copy_backward(base::__make_iter(0), base::__make_iter(_Size - __pos), base::__make_iter(_Size)); - _CUDA_VSTD::fill_n(base::__make_iter(0), __pos, false); - return *this; + __pos = _CUDA_VSTD::min(__pos, _Size); + _CUDA_VSTD::copy_backward(base::__make_iter(0), base::__make_iter(_Size - __pos), base::__make_iter(_Size)); + _CUDA_VSTD::fill_n(base::__make_iter(0), __pos, false); + return *this; } template -bitset<_Size>& -bitset<_Size>::operator>>=(size_t __pos) noexcept +bitset<_Size>& bitset<_Size>::operator>>=(size_t __pos) noexcept { - __pos = _CUDA_VSTD::min(__pos, _Size); - _CUDA_VSTD::copy(base::__make_iter(__pos), base::__make_iter(_Size), base::__make_iter(0)); - _CUDA_VSTD::fill_n(base::__make_iter(_Size - __pos), __pos, false); - return *this; + __pos = _CUDA_VSTD::min(__pos, _Size); + _CUDA_VSTD::copy(base::__make_iter(__pos), base::__make_iter(_Size), base::__make_iter(0)); + _CUDA_VSTD::fill_n(base::__make_iter(_Size - __pos), __pos, false); + return *this; } template -inline -bitset<_Size>& -bitset<_Size>::set() noexcept +inline bitset<_Size>& bitset<_Size>::set() noexcept { - _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, true); - return *this; + _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, true); + return *this; } template -bitset<_Size>& -bitset<_Size>::set(size_t __pos, bool __val) +bitset<_Size>& bitset<_Size>::set(size_t __pos, bool __val) { - if (__pos >= _Size) - __throw_out_of_range("bitset set argument out of range"); + if 
(__pos >= _Size) + { + __throw_out_of_range("bitset set argument out of range"); + } - (*this)[__pos] = __val; - return *this; + (*this)[__pos] = __val; + return *this; } template -inline -bitset<_Size>& -bitset<_Size>::reset() noexcept +inline bitset<_Size>& bitset<_Size>::reset() noexcept { - _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, false); - return *this; + _CUDA_VSTD::fill_n(base::__make_iter(0), _Size, false); + return *this; } template -bitset<_Size>& -bitset<_Size>::reset(size_t __pos) +bitset<_Size>& bitset<_Size>::reset(size_t __pos) { - if (__pos >= _Size) - __throw_out_of_range("bitset reset argument out of range"); + if (__pos >= _Size) + { + __throw_out_of_range("bitset reset argument out of range"); + } - (*this)[__pos] = false; - return *this; + (*this)[__pos] = false; + return *this; } template -inline -bitset<_Size> -bitset<_Size>::operator~() const noexcept +inline bitset<_Size> bitset<_Size>::operator~() const noexcept { - bitset __x(*this); - __x.flip(); - return __x; + bitset __x(*this); + __x.flip(); + return __x; } template -inline -bitset<_Size>& -bitset<_Size>::flip() noexcept +inline bitset<_Size>& bitset<_Size>::flip() noexcept { - base::flip(); - return *this; + base::flip(); + return *this; } template -bitset<_Size>& -bitset<_Size>::flip(size_t __pos) +bitset<_Size>& bitset<_Size>::flip(size_t __pos) { - if (__pos >= _Size) - __throw_out_of_range("bitset flip argument out of range"); + if (__pos >= _Size) + { + __throw_out_of_range("bitset flip argument out of range"); + } - reference r = base::__make_ref(__pos); - r = ~r; - return *this; + reference r = base::__make_ref(__pos); + r = ~r; + return *this; } template -inline -unsigned long -bitset<_Size>::to_ulong() const +inline unsigned long bitset<_Size>::to_ulong() const { - return base::to_ulong(); + return base::to_ulong(); } template -inline -unsigned long long -bitset<_Size>::to_ullong() const +inline unsigned long long bitset<_Size>::to_ullong() const { - return base::to_ullong(); + return base::to_ullong(); } template template -basic_string<_CharT, _Traits, _Allocator> -bitset<_Size>::to_string(_CharT __zero, _CharT __one) const +basic_string<_CharT, _Traits, _Allocator> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { - basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); - for (size_t __i = 0; __i < _Size; ++__i) + basic_string<_CharT, _Traits, _Allocator> __r(_Size, __zero); + for (size_t __i = 0; __i < _Size; ++__i) + { + if ((*this)[__i]) { - if ((*this)[__i]) - __r[_Size - 1 - __i] = __one; + __r[_Size - 1 - __i] = __one; } - return __r; + } + return __r; } template template -inline -basic_string<_CharT, _Traits, allocator<_CharT> > -bitset<_Size>::to_string(_CharT __zero, _CharT __one) const +inline basic_string<_CharT, _Traits, allocator<_CharT>> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { - return to_string<_CharT, _Traits, allocator<_CharT> >(__zero, __one); + return to_string<_CharT, _Traits, allocator<_CharT>>(__zero, __one); } template template -inline -basic_string<_CharT, char_traits<_CharT>, allocator<_CharT> > +inline basic_string<_CharT, char_traits<_CharT>, allocator<_CharT>> bitset<_Size>::to_string(_CharT __zero, _CharT __one) const { - return to_string<_CharT, char_traits<_CharT>, allocator<_CharT> >(__zero, __one); + return to_string<_CharT, char_traits<_CharT>, allocator<_CharT>>(__zero, __one); } template -inline -basic_string, allocator > -bitset<_Size>::to_string(char __zero, char __one) const +inline basic_string, allocator> 
bitset<_Size>::to_string(char __zero, char __one) const { - return to_string, allocator >(__zero, __one); + return to_string, allocator>(__zero, __one); } template -inline -size_t -bitset<_Size>::count() const noexcept +inline size_t bitset<_Size>::count() const noexcept { - return static_cast(__count_bool_true(base::__make_iter(0), _Size)); + return static_cast(__count_bool_true(base::__make_iter(0), _Size)); } template -inline -bool -bitset<_Size>::operator==(const bitset& __rhs) const noexcept +inline bool bitset<_Size>::operator==(const bitset& __rhs) const noexcept { - return _CUDA_VSTD::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); + return _CUDA_VSTD::equal(base::__make_iter(0), base::__make_iter(_Size), __rhs.__make_iter(0)); } template -inline -bool -bitset<_Size>::operator!=(const bitset& __rhs) const noexcept +inline bool bitset<_Size>::operator!=(const bitset& __rhs) const noexcept { - return !(*this == __rhs); + return !(*this == __rhs); } template -bool -bitset<_Size>::test(size_t __pos) const +bool bitset<_Size>::test(size_t __pos) const { - if (__pos >= _Size) - __throw_out_of_range("bitset test argument out of range"); + if (__pos >= _Size) + { + __throw_out_of_range("bitset test argument out of range"); + } - return (*this)[__pos]; + return (*this)[__pos]; } template -inline -bool -bitset<_Size>::all() const noexcept +inline bool bitset<_Size>::all() const noexcept { - return base::all(); + return base::all(); } template -inline -bool -bitset<_Size>::any() const noexcept +inline bool bitset<_Size>::any() const noexcept { - return base::any(); + return base::any(); } template -inline -bitset<_Size> -bitset<_Size>::operator<<(size_t __pos) const noexcept +inline bitset<_Size> bitset<_Size>::operator<<(size_t __pos) const noexcept { - bitset __r = *this; - __r <<= __pos; - return __r; + bitset __r = *this; + __r <<= __pos; + return __r; } template -inline -bitset<_Size> -bitset<_Size>::operator>>(size_t __pos) const noexcept +inline bitset<_Size> bitset<_Size>::operator>>(size_t __pos) const noexcept { - bitset __r = *this; - __r >>= __pos; - return __r; + bitset __r = *this; + __r >>= __pos; + return __r; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -bitset<_Size> -operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator&(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept { - bitset<_Size> __r = __x; - __r &= __y; - return __r; + bitset<_Size> __r = __x; + __r &= __y; + return __r; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -bitset<_Size> -operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator|(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept { - bitset<_Size> __r = __x; - __r |= __y; - return __r; + bitset<_Size> __r = __x; + __r |= __y; + return __r; } template -inline _LIBCUDACXX_INLINE_VISIBILITY -bitset<_Size> -operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept +inline _LIBCUDACXX_INLINE_VISIBILITY bitset<_Size> operator^(const bitset<_Size>& __x, const bitset<_Size>& __y) noexcept { - bitset<_Size> __r = __x; - __r ^= __y; - return __r; + bitset<_Size> __r = __x; + __r ^= __y; + return __r; } template -struct _LIBCUDACXX_TEMPLATE_VIS hash > - : public __unary_function, size_t> +struct _LIBCUDACXX_TEMPLATE_VIS hash> : public __unary_function, size_t> { - _LIBCUDACXX_INLINE_VISIBILITY - size_t operator()(const bitset<_Size>& __bs) const 
noexcept - {return __bs.__hash_code();} + _LIBCUDACXX_INLINE_VISIBILITY size_t operator()(const bitset<_Size>& __bs) const noexcept + { + return __bs.__hash_code(); + } }; template -basic_istream<_CharT, _Traits>& -operator>>(basic_istream<_CharT, _Traits>& __is, bitset<_Size>& __x); +basic_istream<_CharT, _Traits>& operator>>(basic_istream<_CharT, _Traits>& __is, bitset<_Size>& __x); template -basic_ostream<_CharT, _Traits>& -operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x); +basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const bitset<_Size>& __x); _LIBCUDACXX_END_NAMESPACE_STD _LIBCUDACXX_POP_MACROS -#endif // _LIBCUDACXX_BITSET +#endif // _LIBCUDACXX_BITSET diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/concepts b/libcudacxx/include/cuda/std/detail/libcxx/include/concepts index 15f041190c0..24995197262 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/concepts +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/concepts @@ -140,7 +140,6 @@ namespace std { # pragma system_header #endif // no system header -#include // all public C++ headers provide the assertion handler #include #include #include @@ -166,7 +165,7 @@ namespace std { #include #include #include - +#include // all public C++ headers provide the assertion handler #include #endif // _LIBCUDACXX_CONCEPTS diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/version b/libcudacxx/include/cuda/std/detail/libcxx/include/version index 9c81e18dcb2..08f33681920 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/version +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/version @@ -10,7 +10,6 @@ #ifndef _LIBCUDACXX_VERSIONH #define _LIBCUDACXX_VERSIONH - /* version synopsis @@ -212,131 +211,131 @@ __cpp_lib_void_t 201411L // We need to define our own macros to not conflict with the host stl. 
// At the same time we want bring in all feature test macros from host #if __has_include() // should be the smallest include possible -#include +# include #elif !defined(_CCCL_COMPILER_NVRTC) -#include // otherwise go for the smallest possible header +# include // otherwise go for the smallest possible header #endif #if _CCCL_STD_VER > 2011 -# define __cccl_lib_chrono_udls 201304L -# define __cccl_lib_complex_udls 201309L -#ifdef _LIBCUDACXX_IS_CONSTANT_EVALUATED -# define __cccl_lib_constexpr_complex 201711L -#endif -# define __cccl_lib_concepts 202002L -# define __cccl_lib_exchange_function 201304L -# define __cccl_lib_expected 202211L +# define __cccl_lib_chrono_udls 201304L +# define __cccl_lib_complex_udls 201309L +# ifdef _LIBCUDACXX_IS_CONSTANT_EVALUATED +# define __cccl_lib_constexpr_complex 201711L +# endif +# define __cccl_lib_concepts 202002L +# define __cccl_lib_exchange_function 201304L +# define __cccl_lib_expected 202211L // # define __cccl_lib_generic_associative_lookup 201304L -# define __cccl_lib_integer_sequence 201304L -# define __cccl_lib_integral_constant_callable 201304L -# define __cccl_lib_is_final 201402L -# define __cccl_lib_is_null_pointer 201309L -# define __cccl_lib_make_reverse_iterator 201402L +# define __cccl_lib_integer_sequence 201304L +# define __cccl_lib_integral_constant_callable 201304L +# define __cccl_lib_is_final 201402L +# define __cccl_lib_is_null_pointer 201309L +# define __cccl_lib_make_reverse_iterator 201402L // # define __cccl_lib_make_unique 201304L -# define __cccl_lib_null_iterators 201304L -# define __cccl_lib_optional 202110L +# define __cccl_lib_null_iterators 201304L +# define __cccl_lib_optional 202110L // # define __cccl_lib_quoted_string_io 201304L -# define __cccl_lib_result_of_sfinae 201210L -# define __cccl_lib_robust_nonmodifying_seq_ops 201304L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# define __cccl_lib_result_of_sfinae 201210L +# define __cccl_lib_robust_nonmodifying_seq_ops 201304L +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) // # define __cccl_lib_shared_timed_mutex 201402L -# endif -# define __cccl_lib_span 202002L +# endif +# define __cccl_lib_span 202002L // # define __cccl_lib_string_udls 201304L -# define __cccl_lib_transformation_trait_aliases 201304L -# define __cccl_lib_transparent_operators 201210L -# define __cccl_lib_tuple_element_t 201402L -# define __cccl_lib_tuples_by_type 201304L +# define __cccl_lib_transformation_trait_aliases 201304L +# define __cccl_lib_transparent_operators 201210L +# define __cccl_lib_tuple_element_t 201402L +# define __cccl_lib_tuples_by_type 201304L #endif // _CCCL_STD_VER > 2011 #if _CCCL_STD_VER > 2014 -# if defined(_LIBCUDACXX_ADDRESSOF) -# define __cccl_lib_addressof_constexpr 201603L -# endif +# if defined(_LIBCUDACXX_ADDRESSOF) +# define __cccl_lib_addressof_constexpr 201603L +# endif // # define __cccl_lib_allocator_traits_is_always_equal 201411L // # define __cccl_lib_any 201606L -# define __cccl_lib_apply 201603L -# define __cccl_lib_array_constexpr 201603L -# define __cccl_lib_as_const 201510L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) -# define __cccl_lib_atomic_is_always_lock_free 201603L -# endif -# define __cccl_lib_bind_front 201907L -# define __cccl_lib_bool_constant 201505L +# define __cccl_lib_apply 201603L +# define __cccl_lib_array_constexpr 201603L +# define __cccl_lib_as_const 201510L +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# define __cccl_lib_atomic_is_always_lock_free 201603L +# endif +# define __cccl_lib_bind_front 201907L +# define 
__cccl_lib_bool_constant 201505L // # define __cccl_lib_boyer_moore_searcher 201603L -# define __cccl_lib_byte 201603L -# define __cccl_lib_chrono 201611L +# define __cccl_lib_byte 201603L +# define __cccl_lib_chrono 201611L // # define __cccl_lib_clamp 201603L // # define __cccl_lib_enable_shared_from_this 201603L // # define __cccl_lib_execution 201603L // # define __cccl_lib_filesystem 201703L -# define __cccl_lib_gcd_lcm 201606L -# define __cccl_lib_hardware_interference_size 201703L -# if defined(_LIBCUDACXX_HAS_UNIQUE_OBJECT_REPRESENTATIONS) -# define __cccl_lib_has_unique_object_representations 201606L -# endif -# define __cccl_lib_hypot 201603L +# define __cccl_lib_gcd_lcm 201606L +# define __cccl_lib_hardware_interference_size 201703L +# if defined(_LIBCUDACXX_HAS_UNIQUE_OBJECT_REPRESENTATIONS) +# define __cccl_lib_has_unique_object_representations 201606L +# endif +# define __cccl_lib_hypot 201603L // # define __cccl_lib_incomplete_container_elements 201505L -# define __cccl_lib_invoke 201411L -# if !defined(_LIBCUDACXX_HAS_NO_IS_AGGREGATE) -# define __cccl_lib_is_aggregate 201703L -# endif -# define __cccl_lib_is_invocable 201703L -# define __cccl_lib_is_swappable 201603L -# define __cccl_lib_launder 201606L -# define __cccl_lib_logical_traits 201510L -# define __cccl_lib_make_from_tuple 201606L +# define __cccl_lib_invoke 201411L +# if !defined(_LIBCUDACXX_HAS_NO_IS_AGGREGATE) +# define __cccl_lib_is_aggregate 201703L +# endif +# define __cccl_lib_is_invocable 201703L +# define __cccl_lib_is_swappable 201603L +# define __cccl_lib_launder 201606L +# define __cccl_lib_logical_traits 201510L +# define __cccl_lib_make_from_tuple 201606L // # define __cccl_lib_map_try_emplace 201411L // # define __cccl_lib_math_special_functions 201603L // # define __cccl_lib_memory_resource 201603L // # define __cccl_lib_node_extract 201606L // # define __cccl_lib_nonmember_container_access 201411L -# define __cccl_lib_not_fn 201603L +# define __cccl_lib_not_fn 201603L // # define __cccl_lib_parallel_algorithm 201603L // # define __cccl_lib_raw_memory_algorithms 201606L // # define __cccl_lib_sample 201603L // # define __cccl_lib_scoped_lock 201703L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) // # define __cccl_lib_shared_mutex 201505L -# endif +# endif // # define __cccl_lib_shared_ptr_arrays 201611L // # define __cccl_lib_shared_ptr_weak_type 201606L // # define __cccl_lib_string_view 201606L // # define __cccl_lib_to_chars 201611L -# define __cccl_lib_type_trait_variable_templates 201510L -# define __cccl_lib_uncaught_exceptions 201411L -# define __cccl_lib_unordered_map_try_emplace 201411L -# define __cccl_lib_variant 201606L -# define __cccl_lib_void_t 201411L +# define __cccl_lib_type_trait_variable_templates 201510L +# define __cccl_lib_uncaught_exceptions 201411L +# define __cccl_lib_unordered_map_try_emplace 201411L +# define __cccl_lib_variant 201606L +# define __cccl_lib_void_t 201411L #endif // _CCCL_STD_VER > 2014 #if _CCCL_STD_VER > 2017 -# undef __cccl_lib_array_constexpr -# define __cccl_lib_array_constexpr 201811L +# undef __cccl_lib_array_constexpr +# define __cccl_lib_array_constexpr 201811L // # define __cccl_lib_assume_aligned 201811L -# define __cccl_lib_atomic_flag_test 201907L -# define __cccl_lib_atomic_float 201711L -# define __cccl_lib_atomic_lock_free_type_aliases 201907L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) -# define __cccl_lib_atomic_ref 201806L -#endif +# define __cccl_lib_atomic_flag_test 201907L +# define 
__cccl_lib_atomic_float 201711L +# define __cccl_lib_atomic_lock_free_type_aliases 201907L +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# define __cccl_lib_atomic_ref 201806L +# endif // # define __cccl_lib_atomic_shared_ptr 201711L -# define __cccl_lib_atomic_value_initialization 201911L -# if !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait) -# define __cccl_lib_atomic_wait 201907L -# endif -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier) -# define __cccl_lib_barrier 201907L -# endif -# define __cccl_lib_bit_cast 201806L -# define __cccl_lib_bitops 201907L -# define __cccl_lib_bounded_array_traits 201902L -# if !defined(_LIBCUDACXX_NO_HAS_CHAR8_T) -# define __cccl_lib_char8_t 201811L -# endif +# define __cccl_lib_atomic_value_initialization 201911L +# if !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_atomic_wait) +# define __cccl_lib_atomic_wait 201907L +# endif +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_barrier) +# define __cccl_lib_barrier 201907L +# endif +# define __cccl_lib_bit_cast 201806L +# define __cccl_lib_bitops 201907L +# define __cccl_lib_bounded_array_traits 201902L +# if !defined(_LIBCUDACXX_NO_HAS_CHAR8_T) +# define __cccl_lib_char8_t 201811L +# endif // # define __cccl_lib_constexpr_algorithms 201806L // # define __cccl_lib_constexpr_dynamic_alloc 201907L -# define __cccl_lib_constexpr_functional 201907L +# define __cccl_lib_constexpr_functional 201907L // # define __cccl_lib_constexpr_iterator 201811L // # define __cccl_lib_constexpr_memory 201811L // # define __cccl_lib_constexpr_misc 201811L @@ -348,40 +347,41 @@ __cpp_lib_void_t 201411L // # define __cccl_lib_constexpr_utility 201811L // # define __cccl_lib_constexpr_vector 201907L // # define __cccl_lib_coroutine 201902L -# if defined(__cpp_impl_destroying_delete) && __cpp_impl_destroying_delete >= 201806L && defined(__cpp_lib_destroying_delete) -# define __cccl_lib_destroying_delete 201806L -# endif +# if defined(__cpp_impl_destroying_delete) && __cpp_impl_destroying_delete >= 201806L \ + && defined(__cpp_lib_destroying_delete) +# define __cccl_lib_destroying_delete 201806L +# endif // # define __cccl_lib_endian 201907L // # define __cccl_lib_erase_if 201811L // # undef __cccl_lib_execution // # define __cccl_lib_execution 201902L -# if !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format) && !defined(_LIBCUDACXX_HAS_NO_INCOMPLETE_FORMAT) +# if !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_format) && !defined(_LIBCUDACXX_HAS_NO_INCOMPLETE_FORMAT) // # define __cccl_lib_format 202106L -# endif +# endif // # define __cccl_lib_generic_unordered_lookup 201811L // # define __cccl_lib_int_pow2 202002L // # define __cccl_lib_integer_comparison_functions 202002L // # define __cccl_lib_interpolate 201902L -# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) -# define __cccl_lib_is_constant_evaluated 201811L -# endif +# if defined(_LIBCUDACXX_IS_CONSTANT_EVALUATED) +# define __cccl_lib_is_constant_evaluated 201811L +# endif // # define __cccl_lib_is_layout_compatible 201907L -# define __cccl_lib_is_nothrow_convertible 201806L +# define __cccl_lib_is_nothrow_convertible 201806L // # define __cccl_lib_is_pointer_interconvertible 201907L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) // # define __cccl_lib_jthread 201911L -# endif -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && 
!defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch) +# endif +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_latch) // # define __cccl_lib_latch 201907L -# endif +# endif // # define __cccl_lib_list_remove_return_type 201806L // # define __cccl_lib_math_constants 201907L // # define __cccl_lib_polymorphic_allocator 201902L // # define __cccl_lib_ranges 201811L // # define __cccl_lib_remove_cvref 201711L -# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore) +# if !defined(_LIBCUDACXX_HAS_NO_THREADS) && !defined(_LIBCUDACXX_AVAILABILITY_DISABLE_FTM___cpp_lib_semaphore) // # define __cccl_lib_semaphore 201907L -# endif +# endif // # undef __cccl_lib_shared_ptr_arrays // # define __cccl_lib_shared_ptr_arrays 201707L // # define __cccl_lib_shift 201806L @@ -396,7 +396,7 @@ __cpp_lib_void_t 201411L // # define __cccl_lib_to_address 201711L // # define __cccl_lib_to_array 201907L // # define __cccl_lib_type_identity 201806L -# define __cccl_lib_unwrap_ref 201811L +# define __cccl_lib_unwrap_ref 201811L #endif // _CCCL_STD_VER > 2017 #if _CCCL_STD_VER > 2020 @@ -411,9 +411,9 @@ __cpp_lib_void_t 201411L // # undef __cccl_lib_constexpr_memory // # define __cccl_lib_constexpr_memory 202202L // # define __cccl_lib_constexpr_typeinfo 202106L -# define __cccl_lib_forward_like 202207L +# define __cccl_lib_forward_like 202207L // # define __cccl_lib_invoke_r 202106L -# define __cccl_lib_is_scoped_enum 202011L +# define __cccl_lib_is_scoped_enum 202011L // # define __cccl_lib_move_only_function 202110L // # define __cccl_lib_out_ptr 202106L // # define __cccl_lib_ranges_chunk 202202L @@ -430,8 +430,8 @@ __cpp_lib_void_t 201411L // # define __cccl_lib_stdatomic_h 202011L // # define __cccl_lib_string_contains 202011L // # define __cccl_lib_string_resize_and_overwrite 202110L -# define __cccl_lib_to_underlying 202102L -# define __cccl_lib_unreachable 202202L +# define __cccl_lib_to_underlying 202102L +# define __cccl_lib_unreachable 202202L #endif // _CCCL_STD_VER > 2020 diff --git a/libcudacxx/include/cuda/std/expected b/libcudacxx/include/cuda/std/expected index 9469e699f34..81879381ad4 100644 --- a/libcudacxx/include/cuda/std/expected +++ b/libcudacxx/include/cuda/std/expected @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif //_CUDA_STD_EXPECTED diff --git a/libcudacxx/include/cuda/std/functional b/libcudacxx/include/cuda/std/functional index 042e4b4b072..d9b2ccd2ff4 100644 --- a/libcudacxx/include/cuda/std/functional +++ b/libcudacxx/include/cuda/std/functional @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_FUNCTIONAL diff --git a/libcudacxx/include/cuda/std/initializer_list b/libcudacxx/include/cuda/std/initializer_list index 24296620e85..4224f9a7d5a 100644 --- a/libcudacxx/include/cuda/std/initializer_list +++ b/libcudacxx/include/cuda/std/initializer_list @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_INITIALIZER_LIST diff --git a/libcudacxx/include/cuda/std/iterator b/libcudacxx/include/cuda/std/iterator index 08568f18628..01ee8962240 100644 --- a/libcudacxx/include/cuda/std/iterator +++ b/libcudacxx/include/cuda/std/iterator @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_ITERATOR diff --git a/libcudacxx/include/cuda/std/latch b/libcudacxx/include/cuda/std/latch index fde9078ab4f..d3dfaf35f46 100644 --- 
a/libcudacxx/include/cuda/std/latch +++ b/libcudacxx/include/cuda/std/latch @@ -13,14 +13,12 @@ #endif #ifndef _CUDA_STD_LATCH -#define _CUDA_STD_LATCH +# define _CUDA_STD_LATCH -#include +# include -#include - -#include - -#include +# include +# include +# include #endif // _CUDA_STD_LATCH diff --git a/libcudacxx/include/cuda/std/limits b/libcudacxx/include/cuda/std/limits index 16531da3da0..c48b86a5e7d 100644 --- a/libcudacxx/include/cuda/std/limits +++ b/libcudacxx/include/cuda/std/limits @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_LIMITS diff --git a/libcudacxx/include/cuda/std/mdspan b/libcudacxx/include/cuda/std/mdspan index e9522897ca0..b5fa0ec9506 100644 --- a/libcudacxx/include/cuda/std/mdspan +++ b/libcudacxx/include/cuda/std/mdspan @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_MDSPAN diff --git a/libcudacxx/include/cuda/std/optional b/libcudacxx/include/cuda/std/optional index 5ecee7594fb..e89476c2737 100644 --- a/libcudacxx/include/cuda/std/optional +++ b/libcudacxx/include/cuda/std/optional @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_OPTIONAL diff --git a/libcudacxx/include/cuda/std/ranges b/libcudacxx/include/cuda/std/ranges index 56a06f65071..54672905285 100644 --- a/libcudacxx/include/cuda/std/ranges +++ b/libcudacxx/include/cuda/std/ranges @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif //_CUDA_RANGES diff --git a/libcudacxx/include/cuda/std/ratio b/libcudacxx/include/cuda/std/ratio index 97425f38d1e..8ebde7c6f51 100644 --- a/libcudacxx/include/cuda/std/ratio +++ b/libcudacxx/include/cuda/std/ratio @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_RATIO diff --git a/libcudacxx/include/cuda/std/semaphore b/libcudacxx/include/cuda/std/semaphore index 5b7efef48a3..645fce15fa0 100644 --- a/libcudacxx/include/cuda/std/semaphore +++ b/libcudacxx/include/cuda/std/semaphore @@ -13,14 +13,12 @@ #endif #ifndef _CUDA_STD_SEMAPHORE -#define _CUDA_STD_SEMAPHORE +# define _CUDA_STD_SEMAPHORE -#include +# include -#include - -#include - -#include +# include +# include +# include #endif // _CUDA_STD_SEMAPHORE diff --git a/libcudacxx/include/cuda/std/span b/libcudacxx/include/cuda/std/span index 0388da66871..e3592da7e16 100644 --- a/libcudacxx/include/cuda/std/span +++ b/libcudacxx/include/cuda/std/span @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_SPAN diff --git a/libcudacxx/include/cuda/std/tuple b/libcudacxx/include/cuda/std/tuple index ee870be346c..5954f9f1878 100644 --- a/libcudacxx/include/cuda/std/tuple +++ b/libcudacxx/include/cuda/std/tuple @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_TUPLE diff --git a/libcudacxx/include/cuda/std/type_traits b/libcudacxx/include/cuda/std/type_traits index 9eee9b7830a..32f2aa0037c 100644 --- a/libcudacxx/include/cuda/std/type_traits +++ b/libcudacxx/include/cuda/std/type_traits @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_TYPE_TRAITS diff --git a/libcudacxx/include/cuda/std/utility b/libcudacxx/include/cuda/std/utility index de2b78ca814..09291daf0a1 100644 --- a/libcudacxx/include/cuda/std/utility +++ b/libcudacxx/include/cuda/std/utility @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_UTILITY diff --git a/libcudacxx/include/cuda/std/variant 
b/libcudacxx/include/cuda/std/variant index 28d59fc012b..d1e4ca8e83d 100644 --- a/libcudacxx/include/cuda/std/variant +++ b/libcudacxx/include/cuda/std/variant @@ -12,10 +12,8 @@ #include +#include #include - #include -#include - #endif //_CUDA_STD_VARIANT diff --git a/libcudacxx/include/cuda/std/version b/libcudacxx/include/cuda/std/version index 2d0cbbe9aab..fddca30c2ce 100644 --- a/libcudacxx/include/cuda/std/version +++ b/libcudacxx/include/cuda/std/version @@ -13,10 +13,8 @@ #include +#include #include - #include -#include - #endif // _CUDA_STD_VERSION diff --git a/libcudacxx/include/cuda/stream_ref b/libcudacxx/include/cuda/stream_ref index 5c2ef3c3d8b..d36c2246550 100644 --- a/libcudacxx/include/cuda/stream_ref +++ b/libcudacxx/include/cuda/stream_ref @@ -38,9 +38,8 @@ private: } // cuda */ -#include // cuda_runtime_api needs to come first - #include +#include // cuda_runtime_api needs to come first #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) # pragma GCC system_header @@ -50,9 +49,9 @@ private: # pragma system_header #endif // no system header -#include #include #include +#include _LIBCUDACXX_BEGIN_NAMESPACE_CUDA @@ -127,7 +126,10 @@ public: } /// Returns the wrapped `cudaStream_t` handle. - _CCCL_NODISCARD constexpr value_type get() const noexcept { return __stream; } + _CCCL_NODISCARD constexpr value_type get() const noexcept + { + return __stream; + } /** * \brief Synchronizes the wrapped stream. @@ -150,7 +152,8 @@ public: _CCCL_NODISCARD bool ready() const { const auto __result = ::cudaStreamQuery(get()); - if (__result == ::cudaErrorNotReady) { + if (__result == ::cudaErrorNotReady) + { return false; } switch (__result) diff --git a/libcudacxx/include/nv/detail/__preprocessor b/libcudacxx/include/nv/detail/__preprocessor index af9382bd13a..b73579246c3 100644 --- a/libcudacxx/include/nv/detail/__preprocessor +++ b/libcudacxx/include/nv/detail/__preprocessor @@ -9,7 +9,7 @@ //===----------------------------------------------------------------------===// #if defined(__GNUC__) -#pragma GCC system_header +# pragma GCC system_header #endif // For all compilers and dialects this header defines: @@ -24,95 +24,153 @@ #if defined(_NV_TARGET_CPP11) # define _NV_EVAL1(...) __VA_ARGS__ -# define _NV_EVAL(...) _NV_EVAL1(__VA_ARGS__) +# define _NV_EVAL(...) _NV_EVAL1(__VA_ARGS__) #else # define _NV_EVAL1(x) x -# define _NV_EVAL(x) _NV_EVAL1(x) +# define _NV_EVAL(x) _NV_EVAL1(x) #endif // C++11 -#define _NV_CONCAT_EVAL1(l, r) _NV_EVAL(l ## r) -#define _NV_CONCAT_EVAL(l, r) _NV_CONCAT_EVAL1(l, r) +#define _NV_CONCAT_EVAL1(l, r) _NV_EVAL(l##r) +#define _NV_CONCAT_EVAL(l, r) _NV_CONCAT_EVAL1(l, r) #define _NV_IF_0(t, f) f #define _NV_IF_1(t, f) t -#define _NV_IF_BIT(b) _NV_EVAL(_NV_IF_##b) -#define _NV_IF__EVAL(fn, t, f) _NV_EVAL(fn(t, f)) +#define _NV_IF_BIT(b) _NV_EVAL(_NV_IF_##b) +#define _NV_IF__EVAL(fn, t, f) _NV_EVAL(fn(t, f)) #define _NV_IF_EVAL(cond, t, f) _NV_IF__EVAL(_NV_IF_BIT(cond), t, f) #define _NV_IF1(cond, t, f) _NV_IF_EVAL(cond, t, f) -#define _NV_IF(cond, t, f) _NV_IF1(_NV_EVAL(cond), _NV_EVAL(t), _NV_EVAL(f)) +#define _NV_IF(cond, t, f) _NV_IF1(_NV_EVAL(cond), _NV_EVAL(t), _NV_EVAL(f)) #if defined(_NV_TARGET_CPP11) // The below mechanisms were derived from: https://gustedt.wordpress.com/2010/06/08/detect-empty-macro-arguments/ -#define _NV_ARG32(...) _NV_EVAL(_NV_ARG32_0(__VA_ARGS__)) -#define _NV_ARG32_0( \ - _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, \ - _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, ...) 
_31 - -#define _NV_HAS_COMMA(...) _NV_ARG32(__VA_ARGS__, \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) - -#define _NV_TRIGGER_PARENTHESIS_(...) , - -#define _NV_ISEMPTY(...) \ - _NV_ISEMPTY0( \ - /* test if there is just one argument, eventually an empty \ - one */ \ - _NV_EVAL(_NV_HAS_COMMA(__VA_ARGS__)), \ - /* test if _TRIGGER_PARENTHESIS_ together with the argument \ - adds a comma */ \ - _NV_EVAL(_NV_HAS_COMMA(_NV_TRIGGER_PARENTHESIS_ __VA_ARGS__)), \ - /* test if the argument together with a parenthesis \ - adds a comma */ \ - _NV_EVAL(_NV_HAS_COMMA(__VA_ARGS__ (/*empty*/))), \ - /* test if placing it between _TRIGGER_PARENTHESIS_ and the \ - parenthesis adds a comma */ \ - _NV_EVAL(_NV_HAS_COMMA(_NV_TRIGGER_PARENTHESIS_ __VA_ARGS__ (/*empty*/))) \ - ) - -#define _NV_PASTE5(_0, _1, _2, _3, _4) _0 ## _1 ## _2 ## _3 ## _4 -#define _NV_ISEMPTY0(_0, _1, _2, _3) _NV_HAS_COMMA(_NV_PASTE5(_NV_IS_EMPTY_CASE_, _0, _1, _2, _3)) -#define _NV_IS_EMPTY_CASE_0001 , - - -#define _NV_REMOVE_PAREN(...) _NV_REMOVE_PAREN1(__VA_ARGS__) -#define _NV_REMOVE_PAREN1(...) _NV_STRIP_PAREN(_NV_IF(_NV_TEST_PAREN(__VA_ARGS__), (_NV_STRIP_PAREN(__VA_ARGS__)), (__VA_ARGS__))) - -#define _NV_STRIP_PAREN2(...) __VA_ARGS__ -#define _NV_STRIP_PAREN1(...) _NV_STRIP_PAREN2 __VA_ARGS__ -#define _NV_STRIP_PAREN(...) _NV_STRIP_PAREN1(__VA_ARGS__) - -#define _NV_TEST_PAREN(...) _NV_TEST_PAREN1(__VA_ARGS__) -#define _NV_TEST_PAREN1(...) _NV_TEST_PAREN2(_NV_TEST_PAREN_DUMMY __VA_ARGS__) -#define _NV_TEST_PAREN2(...) _NV_TEST_PAREN3(_NV_CONCAT_EVAL(_, __VA_ARGS__)) -#define _NV_TEST_PAREN3(...) _NV_EVAL(_NV_FIRST_ARG(__VA_ARGS__)) - -#define __NV_PAREN_YES 1 -#define __NV_PAREN_NO 0 - -#define _NV_TEST_PAREN_DUMMY(...) _NV_PAREN_YES -#define __NV_TEST_PAREN_DUMMY __NV_PAREN_NO, - -#define _NV_FIRST_ARG1(x, ...) x -#define _NV_FIRST_ARG(x, ...) _NV_FIRST_ARG1(x) - -#define _NV_REMOVE_FIRST_ARGS1(...) __VA_ARGS__ -#define _NV_REMOVE_FIRST_ARGS(x, ...) _NV_REMOVE_FIRST_ARGS1(__VA_ARGS__) - -#define _NV_NUM_ARGS(...) _NV_NUM_ARGS0(__VA_ARGS__) -#define _NV_NUM_ARGS0(...) _NV_EVAL(_NV_NUM_ARGS1(__VA_ARGS__)) -#define _NV_NUM_ARGS1(...) _NV_IF(_NV_ISEMPTY(__VA_ARGS__), 0, _NV_NUM_ARGS2(__VA_ARGS__)) -#define _NV_NUM_ARGS2(...) _NV_ARG32(__VA_ARGS__, \ - 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16, \ - 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) - -#define _NV_DISPATCH_N_IMPL1(name, ...) _NV_EVAL(name(__VA_ARGS__)) -#define _NV_DISPATCH_N_IMPL0(depth, name, ...) _NV_DISPATCH_N_IMPL1(_NV_CONCAT_EVAL(name, depth), __VA_ARGS__) -#define _NV_DISPATCH_N_IMPL(name, ...) _NV_DISPATCH_N_IMPL0(_NV_NUM_ARGS(__VA_ARGS__), name, __VA_ARGS__) -#define _NV_DISPATCH_N_ARY(name, ...) _NV_DISPATCH_N_IMPL(name, __VA_ARGS__) +# define _NV_ARG32(...) _NV_EVAL(_NV_ARG32_0(__VA_ARGS__)) +# define _NV_ARG32_0( \ + _0, \ + _1, \ + _2, \ + _3, \ + _4, \ + _5, \ + _6, \ + _7, \ + _8, \ + _9, \ + _10, \ + _11, \ + _12, \ + _13, \ + _14, \ + _15, \ + _16, \ + _17, \ + _18, \ + _19, \ + _20, \ + _21, \ + _22, \ + _23, \ + _24, \ + _25, \ + _26, \ + _27, \ + _28, \ + _29, \ + _30, \ + _31, \ + ...) \ + _31 + +# define _NV_HAS_COMMA(...) \ + _NV_ARG32(__VA_ARGS__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) + +# define _NV_TRIGGER_PARENTHESIS_(...) , + +# define _NV_ISEMPTY(...) 
\ + _NV_ISEMPTY0(/* test if there is just one argument, eventually an empty \ + one */ \ + _NV_EVAL(_NV_HAS_COMMA(__VA_ARGS__)), /* test if _TRIGGER_PARENTHESIS_ together with the argument \ + adds a comma */ \ + _NV_EVAL(_NV_HAS_COMMA(_NV_TRIGGER_PARENTHESIS_ __VA_ARGS__)), /* test if the argument together with \ + a parenthesis adds a comma */ \ + _NV_EVAL(_NV_HAS_COMMA(__VA_ARGS__(/*empty*/))), /* test if placing it between _TRIGGER_PARENTHESIS_ \ + and the parenthesis adds a comma */ \ + _NV_EVAL(_NV_HAS_COMMA(_NV_TRIGGER_PARENTHESIS_ __VA_ARGS__(/*empty*/)))) + +# define _NV_PASTE5(_0, _1, _2, _3, _4) _0##_1##_2##_3##_4 +# define _NV_ISEMPTY0(_0, _1, _2, _3) _NV_HAS_COMMA(_NV_PASTE5(_NV_IS_EMPTY_CASE_, _0, _1, _2, _3)) +# define _NV_IS_EMPTY_CASE_0001 , + +# define _NV_REMOVE_PAREN(...) _NV_REMOVE_PAREN1(__VA_ARGS__) +# define _NV_REMOVE_PAREN1(...) \ + _NV_STRIP_PAREN(_NV_IF(_NV_TEST_PAREN(__VA_ARGS__), (_NV_STRIP_PAREN(__VA_ARGS__)), (__VA_ARGS__))) + +# define _NV_STRIP_PAREN2(...) __VA_ARGS__ +# define _NV_STRIP_PAREN1(...) _NV_STRIP_PAREN2 __VA_ARGS__ +# define _NV_STRIP_PAREN(...) _NV_STRIP_PAREN1(__VA_ARGS__) + +# define _NV_TEST_PAREN(...) _NV_TEST_PAREN1(__VA_ARGS__) +# define _NV_TEST_PAREN1(...) _NV_TEST_PAREN2(_NV_TEST_PAREN_DUMMY __VA_ARGS__) +# define _NV_TEST_PAREN2(...) _NV_TEST_PAREN3(_NV_CONCAT_EVAL(_, __VA_ARGS__)) +# define _NV_TEST_PAREN3(...) _NV_EVAL(_NV_FIRST_ARG(__VA_ARGS__)) + +# define __NV_PAREN_YES 1 +# define __NV_PAREN_NO 0 + +# define _NV_TEST_PAREN_DUMMY(...) _NV_PAREN_YES +# define __NV_TEST_PAREN_DUMMY __NV_PAREN_NO, + +# define _NV_FIRST_ARG1(x, ...) x +# define _NV_FIRST_ARG(x, ...) _NV_FIRST_ARG1(x) + +# define _NV_REMOVE_FIRST_ARGS1(...) __VA_ARGS__ +# define _NV_REMOVE_FIRST_ARGS(x, ...) _NV_REMOVE_FIRST_ARGS1(__VA_ARGS__) + +# define _NV_NUM_ARGS(...) _NV_NUM_ARGS0(__VA_ARGS__) +# define _NV_NUM_ARGS0(...) _NV_EVAL(_NV_NUM_ARGS1(__VA_ARGS__)) +# define _NV_NUM_ARGS1(...) _NV_IF(_NV_ISEMPTY(__VA_ARGS__), 0, _NV_NUM_ARGS2(__VA_ARGS__)) +# define _NV_NUM_ARGS2(...) \ + _NV_ARG32( \ + __VA_ARGS__, \ + 31, \ + 30, \ + 29, \ + 28, \ + 27, \ + 26, \ + 25, \ + 24, \ + 23, \ + 22, \ + 21, \ + 20, \ + 19, \ + 18, \ + 17, \ + 16, \ + 15, \ + 14, \ + 13, \ + 12, \ + 11, \ + 10, \ + 9, \ + 8, \ + 7, \ + 6, \ + 5, \ + 4, \ + 3, \ + 2, \ + 1, \ + 0) + +# define _NV_DISPATCH_N_IMPL1(name, ...) _NV_EVAL(name(__VA_ARGS__)) +# define _NV_DISPATCH_N_IMPL0(depth, name, ...) _NV_DISPATCH_N_IMPL1(_NV_CONCAT_EVAL(name, depth), __VA_ARGS__) +# define _NV_DISPATCH_N_IMPL(name, ...) _NV_DISPATCH_N_IMPL0(_NV_NUM_ARGS(__VA_ARGS__), name, __VA_ARGS__) +# define _NV_DISPATCH_N_ARY(name, ...) 
_NV_DISPATCH_N_IMPL(name, __VA_ARGS__) #endif // C++11 diff --git a/libcudacxx/include/nv/detail/__target_macros b/libcudacxx/include/nv/detail/__target_macros index 6d108021b41..59df8dfd188 100644 --- a/libcudacxx/include/nv/detail/__target_macros +++ b/libcudacxx/include/nv/detail/__target_macros @@ -14,42 +14,42 @@ #include #if defined(__GNUC__) -#pragma GCC system_header +# pragma GCC system_header #endif -# define _NV_TARGET_ARCH_TO_SELECTOR_350 nv::target::sm_35 -# define _NV_TARGET_ARCH_TO_SELECTOR_370 nv::target::sm_37 -# define _NV_TARGET_ARCH_TO_SELECTOR_500 nv::target::sm_50 -# define _NV_TARGET_ARCH_TO_SELECTOR_520 nv::target::sm_52 -# define _NV_TARGET_ARCH_TO_SELECTOR_530 nv::target::sm_53 -# define _NV_TARGET_ARCH_TO_SELECTOR_600 nv::target::sm_60 -# define _NV_TARGET_ARCH_TO_SELECTOR_610 nv::target::sm_61 -# define _NV_TARGET_ARCH_TO_SELECTOR_620 nv::target::sm_62 -# define _NV_TARGET_ARCH_TO_SELECTOR_700 nv::target::sm_70 -# define _NV_TARGET_ARCH_TO_SELECTOR_720 nv::target::sm_72 -# define _NV_TARGET_ARCH_TO_SELECTOR_750 nv::target::sm_75 -# define _NV_TARGET_ARCH_TO_SELECTOR_800 nv::target::sm_80 -# define _NV_TARGET_ARCH_TO_SELECTOR_860 nv::target::sm_86 -# define _NV_TARGET_ARCH_TO_SELECTOR_870 nv::target::sm_87 -# define _NV_TARGET_ARCH_TO_SELECTOR_890 nv::target::sm_89 -# define _NV_TARGET_ARCH_TO_SELECTOR_900 nv::target::sm_90 - -# define _NV_TARGET_ARCH_TO_SM_350 35 -# define _NV_TARGET_ARCH_TO_SM_370 37 -# define _NV_TARGET_ARCH_TO_SM_500 50 -# define _NV_TARGET_ARCH_TO_SM_520 52 -# define _NV_TARGET_ARCH_TO_SM_530 53 -# define _NV_TARGET_ARCH_TO_SM_600 60 -# define _NV_TARGET_ARCH_TO_SM_610 61 -# define _NV_TARGET_ARCH_TO_SM_620 62 -# define _NV_TARGET_ARCH_TO_SM_700 70 -# define _NV_TARGET_ARCH_TO_SM_720 72 -# define _NV_TARGET_ARCH_TO_SM_750 75 -# define _NV_TARGET_ARCH_TO_SM_800 80 -# define _NV_TARGET_ARCH_TO_SM_860 86 -# define _NV_TARGET_ARCH_TO_SM_870 87 -# define _NV_TARGET_ARCH_TO_SM_890 89 -# define _NV_TARGET_ARCH_TO_SM_900 90 +#define _NV_TARGET_ARCH_TO_SELECTOR_350 nv::target::sm_35 +#define _NV_TARGET_ARCH_TO_SELECTOR_370 nv::target::sm_37 +#define _NV_TARGET_ARCH_TO_SELECTOR_500 nv::target::sm_50 +#define _NV_TARGET_ARCH_TO_SELECTOR_520 nv::target::sm_52 +#define _NV_TARGET_ARCH_TO_SELECTOR_530 nv::target::sm_53 +#define _NV_TARGET_ARCH_TO_SELECTOR_600 nv::target::sm_60 +#define _NV_TARGET_ARCH_TO_SELECTOR_610 nv::target::sm_61 +#define _NV_TARGET_ARCH_TO_SELECTOR_620 nv::target::sm_62 +#define _NV_TARGET_ARCH_TO_SELECTOR_700 nv::target::sm_70 +#define _NV_TARGET_ARCH_TO_SELECTOR_720 nv::target::sm_72 +#define _NV_TARGET_ARCH_TO_SELECTOR_750 nv::target::sm_75 +#define _NV_TARGET_ARCH_TO_SELECTOR_800 nv::target::sm_80 +#define _NV_TARGET_ARCH_TO_SELECTOR_860 nv::target::sm_86 +#define _NV_TARGET_ARCH_TO_SELECTOR_870 nv::target::sm_87 +#define _NV_TARGET_ARCH_TO_SELECTOR_890 nv::target::sm_89 +#define _NV_TARGET_ARCH_TO_SELECTOR_900 nv::target::sm_90 + +#define _NV_TARGET_ARCH_TO_SM_350 35 +#define _NV_TARGET_ARCH_TO_SM_370 37 +#define _NV_TARGET_ARCH_TO_SM_500 50 +#define _NV_TARGET_ARCH_TO_SM_520 52 +#define _NV_TARGET_ARCH_TO_SM_530 53 +#define _NV_TARGET_ARCH_TO_SM_600 60 +#define _NV_TARGET_ARCH_TO_SM_610 61 +#define _NV_TARGET_ARCH_TO_SM_620 62 +#define _NV_TARGET_ARCH_TO_SM_700 70 +#define _NV_TARGET_ARCH_TO_SM_720 72 +#define _NV_TARGET_ARCH_TO_SM_750 75 +#define _NV_TARGET_ARCH_TO_SM_800 80 +#define _NV_TARGET_ARCH_TO_SM_860 86 +#define _NV_TARGET_ARCH_TO_SM_870 87 +#define _NV_TARGET_ARCH_TO_SM_890 89 +#define _NV_TARGET_ARCH_TO_SM_900 90 
// Only enable when compiling for CUDA/stdpar #if defined(_NV_COMPILER_NVCXX) && defined(_NVHPC_CUDA) @@ -71,22 +71,22 @@ # define _NV_TARGET_VAL_SM_89 nv::target::sm_89 # define _NV_TARGET_VAL_SM_90 nv::target::sm_90 -# define _NV_TARGET___NV_IS_HOST nv::target::is_host +# define _NV_TARGET___NV_IS_HOST nv::target::is_host # define _NV_TARGET___NV_IS_DEVICE nv::target::is_device # define _NV_TARGET___NV_ANY_TARGET (nv::target::any_target) -# define _NV_TARGET___NV_NO_TARGET (nv::target::no_target) +# define _NV_TARGET___NV_NO_TARGET (nv::target::no_target) # if defined(NV_TARGET_SM_INTEGER_LIST) # define NV_TARGET_MINIMUM_SM_SELECTOR _NV_FIRST_ARG(NV_TARGET_SM_SELECTOR_LIST) -# define NV_TARGET_MINIMUM_SM_INTEGER _NV_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST) -# define __CUDA_MINIMUM_ARCH__ _NV_CONCAT_EVAL(_NV_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST), 0) +# define NV_TARGET_MINIMUM_SM_INTEGER _NV_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST) +# define __CUDA_MINIMUM_ARCH__ _NV_CONCAT_EVAL(_NV_FIRST_ARG(NV_TARGET_SM_INTEGER_LIST), 0) # endif # define _NV_TARGET_PROVIDES(q) nv::target::provides(q) # define _NV_TARGET_IS_EXACTLY(q) nv::target::is_exactly(q) -#elif defined(_NV_COMPILER_NVCC) || defined (_NV_COMPILER_CLANG_CUDA) +#elif defined(_NV_COMPILER_NVCC) || defined(_NV_COMPILER_CLANG_CUDA) # define _NV_TARGET_VAL_SM_35 350 # define _NV_TARGET_VAL_SM_37 370 @@ -106,10 +106,10 @@ # define _NV_TARGET_VAL_SM_90 900 # if defined(__CUDA_ARCH__) -# define _NV_TARGET_VAL __CUDA_ARCH__ +# define _NV_TARGET_VAL __CUDA_ARCH__ # define NV_TARGET_MINIMUM_SM_SELECTOR _NV_CONCAT_EVAL(_NV_TARGET_ARCH_TO_SELECTOR_, __CUDA_ARCH__) -# define NV_TARGET_MINIMUM_SM_INTEGER _NV_CONCAT_EVAL(_NV_TARGET_ARCH_TO_SM_, __CUDA_ARCH__) -# define __CUDA_MINIMUM_ARCH__ __CUDA_ARCH__ +# define NV_TARGET_MINIMUM_SM_INTEGER _NV_CONCAT_EVAL(_NV_TARGET_ARCH_TO_SM_, __CUDA_ARCH__) +# define __CUDA_MINIMUM_ARCH__ __CUDA_ARCH__ # endif # if defined(__CUDA_ARCH__) @@ -197,22 +197,22 @@ #define _NV_TARGET___NV_IS_EXACTLY_SM_89 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_89)) #define _NV_TARGET___NV_IS_EXACTLY_SM_90 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_90)) -#define NV_PROVIDES_SM_35 __NV_PROVIDES_SM_35 -#define NV_PROVIDES_SM_37 __NV_PROVIDES_SM_37 -#define NV_PROVIDES_SM_50 __NV_PROVIDES_SM_50 -#define NV_PROVIDES_SM_52 __NV_PROVIDES_SM_52 -#define NV_PROVIDES_SM_53 __NV_PROVIDES_SM_53 -#define NV_PROVIDES_SM_60 __NV_PROVIDES_SM_60 -#define NV_PROVIDES_SM_61 __NV_PROVIDES_SM_61 -#define NV_PROVIDES_SM_62 __NV_PROVIDES_SM_62 -#define NV_PROVIDES_SM_70 __NV_PROVIDES_SM_70 -#define NV_PROVIDES_SM_72 __NV_PROVIDES_SM_72 -#define NV_PROVIDES_SM_75 __NV_PROVIDES_SM_75 -#define NV_PROVIDES_SM_80 __NV_PROVIDES_SM_80 -#define NV_PROVIDES_SM_86 __NV_PROVIDES_SM_86 -#define NV_PROVIDES_SM_87 __NV_PROVIDES_SM_87 -#define NV_PROVIDES_SM_89 __NV_PROVIDES_SM_89 -#define NV_PROVIDES_SM_90 __NV_PROVIDES_SM_90 +#define NV_PROVIDES_SM_35 __NV_PROVIDES_SM_35 +#define NV_PROVIDES_SM_37 __NV_PROVIDES_SM_37 +#define NV_PROVIDES_SM_50 __NV_PROVIDES_SM_50 +#define NV_PROVIDES_SM_52 __NV_PROVIDES_SM_52 +#define NV_PROVIDES_SM_53 __NV_PROVIDES_SM_53 +#define NV_PROVIDES_SM_60 __NV_PROVIDES_SM_60 +#define NV_PROVIDES_SM_61 __NV_PROVIDES_SM_61 +#define NV_PROVIDES_SM_62 __NV_PROVIDES_SM_62 +#define NV_PROVIDES_SM_70 __NV_PROVIDES_SM_70 +#define NV_PROVIDES_SM_72 __NV_PROVIDES_SM_72 +#define NV_PROVIDES_SM_75 __NV_PROVIDES_SM_75 +#define NV_PROVIDES_SM_80 __NV_PROVIDES_SM_80 +#define NV_PROVIDES_SM_86 __NV_PROVIDES_SM_86 +#define NV_PROVIDES_SM_87 __NV_PROVIDES_SM_87 
+#define NV_PROVIDES_SM_89 __NV_PROVIDES_SM_89 +#define NV_PROVIDES_SM_90 __NV_PROVIDES_SM_90 #define NV_IS_EXACTLY_SM_35 __NV_IS_EXACTLY_SM_35 #define NV_IS_EXACTLY_SM_37 __NV_IS_EXACTLY_SM_37 @@ -235,11 +235,11 @@ // Will re-enable for nvcc below. #define NV_HAS_FEATURE_SM_90a NV_NO_TARGET -#define NV_IS_HOST __NV_IS_HOST -#define NV_IS_DEVICE __NV_IS_DEVICE +#define NV_IS_HOST __NV_IS_HOST +#define NV_IS_DEVICE __NV_IS_DEVICE -#define NV_ANY_TARGET __NV_ANY_TARGET -#define NV_NO_TARGET __NV_NO_TARGET +#define NV_ANY_TARGET __NV_ANY_TARGET +#define NV_NO_TARGET __NV_NO_TARGET // Platform invoke mechanisms #if defined(_NV_COMPILER_NVCXX) && defined(_NVHPC_CUDA) @@ -249,11 +249,9 @@ # define _NV_BLOCK_EXPAND(...) _NV_REMOVE_PAREN(__VA_ARGS__) # define _NV_TARGET_IF(cond, t, ...) \ - (if target _NV_ARCH_COND(cond) { \ - _NV_BLOCK_EXPAND(t) \ - } else { _NV_BLOCK_EXPAND(__VA_ARGS__) }) + (if target _NV_ARCH_COND(cond) { _NV_BLOCK_EXPAND(t) } else {_NV_BLOCK_EXPAND(__VA_ARGS__)}) -#elif defined(_NV_COMPILER_NVCC) || defined (_NV_COMPILER_CLANG_CUDA) +#elif defined(_NV_COMPILER_NVCC) || defined(_NV_COMPILER_CLANG_CUDA) # if (_NV_TARGET___NV_IS_EXACTLY_SM_35) # define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_35 1 @@ -353,7 +351,7 @@ // Re-enable sm_90a support in nvcc. # undef NV_HAS_FEATURE_SM_90a -# define NV_HAS_FEATURE_SM_90a __NV_HAS_FEATURE_SM_90a +# define NV_HAS_FEATURE_SM_90a __NV_HAS_FEATURE_SM_90a # if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL)) # define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_90a 1 # else @@ -369,7 +367,7 @@ # endif # define _NV_TARGET_BOOL___NV_ANY_TARGET 1 -# define _NV_TARGET_BOOL___NV_NO_TARGET 0 +# define _NV_TARGET_BOOL___NV_NO_TARGET 0 // NVCC Greater than stuff @@ -470,18 +468,24 @@ # endif # define _NV_ARCH_COND_CAT1(cond) _NV_TARGET_BOOL_##cond -# define _NV_ARCH_COND_CAT(cond) _NV_EVAL(_NV_ARCH_COND_CAT1(cond)) +# define _NV_ARCH_COND_CAT(cond) _NV_EVAL(_NV_ARCH_COND_CAT1(cond)) -# define _NV_TARGET_EMPTY_PARAM ; +# define _NV_TARGET_EMPTY_PARAM ; # if defined(_NV_TARGET_CPP11) -# define _NV_BLOCK_EXPAND(...) { _NV_REMOVE_PAREN(__VA_ARGS__) } -# define _NV_TARGET_IF(cond, t, ...) _NV_IF( _NV_ARCH_COND_CAT(cond), t, __VA_ARGS__) +# define _NV_BLOCK_EXPAND(...) \ + { \ + _NV_REMOVE_PAREN(__VA_ARGS__) \ + } +# define _NV_TARGET_IF(cond, t, ...) 
_NV_IF(_NV_ARCH_COND_CAT(cond), t, __VA_ARGS__) # else // = 201103L) || \ - (defined(_MSC_VER) && _MSVC_LANG >= 201103L)) +#if (!defined(__ibmxl__)) \ + && ((defined(__cplusplus) && __cplusplus >= 201103L) || (defined(_MSC_VER) && _MSVC_LANG >= 201103L)) # define _NV_TARGET_CPP11 #endif - // Hide `if target` support from NVRTC #if defined(_NV_TARGET_CPP11) && !defined(__CUDACC_RTC__) -#if defined(_NV_COMPILER_NVCXX) -# define _NV_BITSET_ATTRIBUTE [[nv::__target_bitset]] -#else -# define _NV_BITSET_ATTRIBUTE -#endif +# if defined(_NV_COMPILER_NVCXX) +# define _NV_BITSET_ATTRIBUTE [[nv::__target_bitset]] +# else +# define _NV_BITSET_ATTRIBUTE +# endif + +namespace nv +{ +namespace target +{ +namespace detail +{ + +typedef unsigned long long base_int_t; + +// No host specialization +constexpr base_int_t all_hosts = 1; + +// NVIDIA GPUs +constexpr base_int_t sm_35_bit = 1 << 1; +constexpr base_int_t sm_37_bit = 1 << 2; +constexpr base_int_t sm_50_bit = 1 << 3; +constexpr base_int_t sm_52_bit = 1 << 4; +constexpr base_int_t sm_53_bit = 1 << 5; +constexpr base_int_t sm_60_bit = 1 << 6; +constexpr base_int_t sm_61_bit = 1 << 7; +constexpr base_int_t sm_62_bit = 1 << 8; +constexpr base_int_t sm_70_bit = 1 << 9; +constexpr base_int_t sm_72_bit = 1 << 10; +constexpr base_int_t sm_75_bit = 1 << 11; +constexpr base_int_t sm_80_bit = 1 << 12; +constexpr base_int_t sm_86_bit = 1 << 13; +constexpr base_int_t sm_87_bit = 1 << 14; +constexpr base_int_t sm_89_bit = 1 << 15; +constexpr base_int_t sm_90_bit = 1 << 16; +constexpr base_int_t all_devices = + sm_35_bit | sm_37_bit | sm_50_bit | sm_52_bit | sm_53_bit | sm_60_bit | sm_61_bit | sm_62_bit | sm_70_bit | sm_72_bit + | sm_75_bit | sm_80_bit | sm_86_bit | sm_87_bit | sm_89_bit | sm_90_bit; + +// Store a set of targets as a set of bits +struct _NV_BITSET_ATTRIBUTE target_description +{ + base_int_t targets; + + constexpr target_description(base_int_t a) + : targets(a) + {} +}; + +// The type of the user-visible names of the NVIDIA GPU targets +enum class sm_selector : base_int_t +{ + sm_35 = 35, + sm_37 = 37, + sm_50 = 50, + sm_52 = 52, + sm_53 = 53, + sm_60 = 60, + sm_61 = 61, + sm_62 = 62, + sm_70 = 70, + sm_72 = 72, + sm_75 = 75, + sm_80 = 80, + sm_86 = 86, + sm_87 = 87, + sm_89 = 89, + sm_90 = 90, +}; + +constexpr base_int_t toint(sm_selector a) +{ + return static_cast(a); +} + +constexpr base_int_t bitexact(sm_selector a) +{ + return toint(a) == 35 ? sm_35_bit + : toint(a) == 37 ? sm_37_bit + : toint(a) == 50 ? sm_50_bit + : toint(a) == 52 ? sm_52_bit + : toint(a) == 53 ? sm_53_bit + : toint(a) == 60 ? sm_60_bit + : toint(a) == 61 ? sm_61_bit + : toint(a) == 62 ? sm_62_bit + : toint(a) == 70 ? sm_70_bit + : toint(a) == 72 ? sm_72_bit + : toint(a) == 75 ? sm_75_bit + : toint(a) == 80 ? sm_80_bit + : toint(a) == 86 ? sm_86_bit + : toint(a) == 87 ? sm_87_bit + : toint(a) == 89 ? sm_89_bit + : toint(a) == 90 ? sm_90_bit + : 0; +} + +constexpr base_int_t bitrounddown(sm_selector a) +{ + return toint(a) >= 90 ? sm_90_bit + : toint(a) >= 89 ? sm_89_bit + : toint(a) >= 87 ? sm_87_bit + : toint(a) >= 86 ? sm_86_bit + : toint(a) >= 80 ? sm_80_bit + : toint(a) >= 75 ? sm_75_bit + : toint(a) >= 72 ? sm_72_bit + : toint(a) >= 70 ? sm_70_bit + : toint(a) >= 62 ? sm_62_bit + : toint(a) >= 61 ? sm_61_bit + : toint(a) >= 60 ? sm_60_bit + : toint(a) >= 53 ? sm_53_bit + : toint(a) >= 52 ? sm_52_bit + : toint(a) >= 50 ? sm_50_bit + : toint(a) >= 37 ? sm_37_bit + : toint(a) >= 35 ? 
sm_35_bit + : 0; +} + +// Public API for NVIDIA GPUs + +constexpr target_description is_exactly(sm_selector a) +{ + return target_description(bitexact(a)); +} + +constexpr target_description provides(sm_selector a) +{ + return target_description(~(bitrounddown(a) - 1) & all_devices); +} + +// Boolean operations on target sets + +constexpr target_description operator&&(target_description a, target_description b) +{ + return target_description(a.targets & b.targets); +} + +constexpr target_description operator||(target_description a, target_description b) +{ + return target_description(a.targets | b.targets); +} -namespace nv { - namespace target { - namespace detail { - - typedef unsigned long long base_int_t; - - // No host specialization - constexpr base_int_t all_hosts = 1; - - // NVIDIA GPUs - constexpr base_int_t sm_35_bit = 1 << 1; - constexpr base_int_t sm_37_bit = 1 << 2; - constexpr base_int_t sm_50_bit = 1 << 3; - constexpr base_int_t sm_52_bit = 1 << 4; - constexpr base_int_t sm_53_bit = 1 << 5; - constexpr base_int_t sm_60_bit = 1 << 6; - constexpr base_int_t sm_61_bit = 1 << 7; - constexpr base_int_t sm_62_bit = 1 << 8; - constexpr base_int_t sm_70_bit = 1 << 9; - constexpr base_int_t sm_72_bit = 1 << 10; - constexpr base_int_t sm_75_bit = 1 << 11; - constexpr base_int_t sm_80_bit = 1 << 12; - constexpr base_int_t sm_86_bit = 1 << 13; - constexpr base_int_t sm_87_bit = 1 << 14; - constexpr base_int_t sm_89_bit = 1 << 15; - constexpr base_int_t sm_90_bit = 1 << 16; - constexpr base_int_t all_devices = - sm_35_bit | sm_37_bit | - sm_50_bit | sm_52_bit | sm_53_bit | - sm_60_bit | sm_61_bit | sm_62_bit | - sm_70_bit | sm_72_bit | sm_75_bit | - sm_80_bit | sm_86_bit | sm_87_bit | - sm_89_bit | sm_90_bit; - - // Store a set of targets as a set of bits - struct _NV_BITSET_ATTRIBUTE target_description { - base_int_t targets; - - constexpr target_description(base_int_t a) : targets(a) { } - }; - - // The type of the user-visible names of the NVIDIA GPU targets - enum class sm_selector : base_int_t { - sm_35 = 35, sm_37 = 37, - sm_50 = 50, sm_52 = 52, sm_53 = 53, - sm_60 = 60, sm_61 = 61, sm_62 = 62, - sm_70 = 70, sm_72 = 72, sm_75 = 75, - sm_80 = 80, sm_86 = 86, sm_87 = 87, - sm_89 = 89, sm_90 = 90, - }; - - constexpr base_int_t toint(sm_selector a) { - return static_cast(a); - } - - constexpr base_int_t bitexact(sm_selector a) { - return toint(a) == 35 ? sm_35_bit : - toint(a) == 37 ? sm_37_bit : - toint(a) == 50 ? sm_50_bit : - toint(a) == 52 ? sm_52_bit : - toint(a) == 53 ? sm_53_bit : - toint(a) == 60 ? sm_60_bit : - toint(a) == 61 ? sm_61_bit : - toint(a) == 62 ? sm_62_bit : - toint(a) == 70 ? sm_70_bit : - toint(a) == 72 ? sm_72_bit : - toint(a) == 75 ? sm_75_bit : - toint(a) == 80 ? sm_80_bit : - toint(a) == 86 ? sm_86_bit : - toint(a) == 87 ? sm_87_bit : - toint(a) == 89 ? sm_89_bit : - toint(a) == 90 ? sm_90_bit : 0; - } - - constexpr base_int_t bitrounddown(sm_selector a) { - return toint(a) >= 90 ? sm_90_bit : - toint(a) >= 89 ? sm_89_bit : - toint(a) >= 87 ? sm_87_bit : - toint(a) >= 86 ? sm_86_bit : - toint(a) >= 80 ? sm_80_bit : - toint(a) >= 75 ? sm_75_bit : - toint(a) >= 72 ? sm_72_bit : - toint(a) >= 70 ? sm_70_bit : - toint(a) >= 62 ? sm_62_bit : - toint(a) >= 61 ? sm_61_bit : - toint(a) >= 60 ? sm_60_bit : - toint(a) >= 53 ? sm_53_bit : - toint(a) >= 52 ? sm_52_bit : - toint(a) >= 50 ? sm_50_bit : - toint(a) >= 37 ? sm_37_bit : - toint(a) >= 35 ? 
sm_35_bit : 0; - } - - // Public API for NVIDIA GPUs - - constexpr target_description is_exactly(sm_selector a) { - return target_description(bitexact(a)); - } - - constexpr target_description provides(sm_selector a) { - return target_description(~(bitrounddown(a) - 1) & all_devices); - } - - // Boolean operations on target sets - - constexpr target_description operator&&(target_description a, - target_description b) { - return target_description(a.targets & b.targets); - } - - constexpr target_description operator||(target_description a, - target_description b) { - return target_description(a.targets | b.targets); - } - - constexpr target_description operator!(target_description a) { - return target_description(~a.targets & (all_devices | all_hosts)); - } - } - - using detail::target_description; - using detail::sm_selector; - - // The predicates for basic host/device selection - constexpr target_description is_host = - target_description(detail::all_hosts); - constexpr target_description is_device = - target_description(detail::all_devices); - constexpr target_description any_target = - target_description(detail::all_hosts | detail::all_devices); - constexpr target_description no_target = - target_description(0); - - // The public names for NVIDIA GPU architectures - constexpr sm_selector sm_35 = sm_selector::sm_35; - constexpr sm_selector sm_37 = sm_selector::sm_37; - constexpr sm_selector sm_50 = sm_selector::sm_50; - constexpr sm_selector sm_52 = sm_selector::sm_52; - constexpr sm_selector sm_53 = sm_selector::sm_53; - constexpr sm_selector sm_60 = sm_selector::sm_60; - constexpr sm_selector sm_61 = sm_selector::sm_61; - constexpr sm_selector sm_62 = sm_selector::sm_62; - constexpr sm_selector sm_70 = sm_selector::sm_70; - constexpr sm_selector sm_72 = sm_selector::sm_72; - constexpr sm_selector sm_75 = sm_selector::sm_75; - constexpr sm_selector sm_80 = sm_selector::sm_80; - constexpr sm_selector sm_86 = sm_selector::sm_86; - constexpr sm_selector sm_87 = sm_selector::sm_87; - constexpr sm_selector sm_89 = sm_selector::sm_89; - constexpr sm_selector sm_90 = sm_selector::sm_90; - - using detail::is_exactly; - using detail::provides; - } +constexpr target_description operator!(target_description a) +{ + return target_description(~a.targets & (all_devices | all_hosts)); } +} // namespace detail + +using detail::sm_selector; +using detail::target_description; + +// The predicates for basic host/device selection +constexpr target_description is_host = target_description(detail::all_hosts); +constexpr target_description is_device = target_description(detail::all_devices); +constexpr target_description any_target = target_description(detail::all_hosts | detail::all_devices); +constexpr target_description no_target = target_description(0); + +// The public names for NVIDIA GPU architectures +constexpr sm_selector sm_35 = sm_selector::sm_35; +constexpr sm_selector sm_37 = sm_selector::sm_37; +constexpr sm_selector sm_50 = sm_selector::sm_50; +constexpr sm_selector sm_52 = sm_selector::sm_52; +constexpr sm_selector sm_53 = sm_selector::sm_53; +constexpr sm_selector sm_60 = sm_selector::sm_60; +constexpr sm_selector sm_61 = sm_selector::sm_61; +constexpr sm_selector sm_62 = sm_selector::sm_62; +constexpr sm_selector sm_70 = sm_selector::sm_70; +constexpr sm_selector sm_72 = sm_selector::sm_72; +constexpr sm_selector sm_75 = sm_selector::sm_75; +constexpr sm_selector sm_80 = sm_selector::sm_80; +constexpr sm_selector sm_86 = sm_selector::sm_86; +constexpr sm_selector sm_87 = sm_selector::sm_87; 
+constexpr sm_selector sm_89 = sm_selector::sm_89;
+constexpr sm_selector sm_90 = sm_selector::sm_90;
+
+using detail::is_exactly;
+using detail::provides;
+} // namespace target
+} // namespace nv

 #endif // C++11 && !defined(__CUDACC_RTC__)
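
Editor's note (illustrative only, not part of the patch): the __preprocessor and __target_macros hunks above are formatting-only, but the dispatch machinery they touch is easiest to understand from the consumer side. Below is a minimal sketch, assuming the usual public <nv/target> entry points NV_IF_TARGET / NV_IF_ELSE_TARGET together with the NV_IS_DEVICE and NV_PROVIDES_SM_80 conditions defined in this header; the function name say_where is made up for the example.

// Illustrative sketch -- not part of the patch above.
#include <nv/target>
#include <cstdio>

__host__ __device__ void say_where()
{
  // One branch is selected per compilation pass (host, or each __CUDA_ARCH__),
  // replacing the classic "#ifdef __CUDA_ARCH__" pattern.
  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
                    (printf("compiled for device\n");),
                    (printf("compiled for host\n");))

  // Architecture-gated path: only emitted when the target provides sm_80 or newer.
  NV_IF_TARGET(NV_PROVIDES_SM_80,
               (printf("sm_80+ features may be used here\n");))
}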
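
Similarly illustrative: the <cuda/std/detail/libcxx/include/version> hunk above only re-indents the __cccl_lib_* feature-test macros, which are CCCL's internal stand-ins for the standard __cpp_lib_* names. The following hedged sketch shows the guard pattern they enable; it assumes cuda::std::bit_cast is usable from <cuda/std/bit> whenever __cccl_lib_bit_cast is defined, and float_bits is a hypothetical helper.

// Illustrative sketch -- not part of the patch above.
#include <cuda/std/version>
#include <cuda/std/bit>
#include <cuda/std/cstdint>
#include <cstring>

__host__ __device__ cuda::std::uint32_t float_bits(float f)
{
#if defined(__cccl_lib_bit_cast)
  // Preferred path: well-defined, constexpr-friendly bit reinterpretation.
  return cuda::std::bit_cast<cuda::std::uint32_t>(f);
#else
  // Fallback for older dialects: memcpy-based type punning.
  cuda::std::uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return bits;
#endif
}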