From 67d6509958751cc31194a845c9f305665a5aaab5 Mon Sep 17 00:00:00 2001
From: Himanshu <addyjeridiq@gmail.com>
Date: Thu, 15 Apr 2021 00:35:00 +0530
Subject: [PATCH 1/7] Update device_radix_sort.cuh

---
 cub/device/device_radix_sort.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cub/device/device_radix_sort.cuh b/cub/device/device_radix_sort.cuh
index 7534c508cc..74c9405bea 100644
--- a/cub/device/device_radix_sort.cuh
+++ b/cub/device/device_radix_sort.cuh
@@ -67,6 +67,7 @@ namespace cub {
  * method can only be applied to unsigned integral types, DeviceRadixSort
  * is able to sort signed and floating-point types via simple bit-wise transformations
  * that ensure lexicographic key ordering. For floating-point types -0.0 and +0.0 are
+ * cub::DeviceRadixSort is stable
  * considered equal and appear in the result in the same order as they appear in
  * the input.
  *

From f014cc58bd671bc6dd4eeb0960e26c06749cfa62 Mon Sep 17 00:00:00 2001
From: Himanshu <addyjeridiq@gmail.com>
Date: Thu, 15 Apr 2021 00:38:20 +0530
Subject: [PATCH 2/7] Update device_radix_sort.cuh

---
 cub/device/device_radix_sort.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cub/device/device_radix_sort.cuh b/cub/device/device_radix_sort.cuh
index 74c9405bea..33aaf7ceb7 100644
--- a/cub/device/device_radix_sort.cuh
+++ b/cub/device/device_radix_sort.cuh
@@ -66,8 +66,7 @@ namespace cub {
  * half-precision floating-point type.  Although the direct radix sorting
  * method can only be applied to unsigned integral types, DeviceRadixSort
  * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering. For floating-point types -0.0 and +0.0 are
- * cub::DeviceRadixSort is stable
+ * that ensure lexicographic key ordering. DeviceRadixSort is stable For floating-point types -0.0 and +0.0 are
  * considered equal and appear in the result in the same order as they appear in
  * the input.
  *

From bc016afdc37325e1e98f52906bf991d40db49842 Mon Sep 17 00:00:00 2001
From: Himanshu <addyjeridiq@gmail.com>
Date: Thu, 15 Apr 2021 22:19:44 +0530
Subject: [PATCH 3/7] Update device_radix_sort.cuh

---
 cub/device/device_radix_sort.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cub/device/device_radix_sort.cuh b/cub/device/device_radix_sort.cuh
index 33aaf7ceb7..45f14c614d 100644
--- a/cub/device/device_radix_sort.cuh
+++ b/cub/device/device_radix_sort.cuh
@@ -66,7 +66,7 @@ namespace cub {
  * half-precision floating-point type.  Although the direct radix sorting
  * method can only be applied to unsigned integral types, DeviceRadixSort
  * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering. DeviceRadixSort is stable For floating-point types -0.0 and +0.0 are
+ * that ensure lexicographic key ordering. DeviceRadixSort is stable. For floating-point types -0.0 and +0.0 are
  * considered equal and appear in the result in the same order as they appear in
  * the input.
  *

From 1966bf33e697f3f00b24e8e0fcaeea0019b20e75 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Wed, 12 May 2021 17:16:46 -0400
Subject: [PATCH 4/7] Add more notices about radix sort stability.

---
 cub/block/block_radix_sort.cuh             | 6 +++---
 cub/device/device_radix_sort.cuh           | 6 +++---
 cub/device/device_segmented_radix_sort.cuh | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh
index b27148b85c..7d3bc93073 100644
--- a/cub/block/block_radix_sort.cuh
+++ b/cub/block/block_radix_sort.cuh
@@ -77,9 +77,9 @@ namespace cub {
  *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
  *   method can only be applied to unsigned integral types, BlockRadixSort
  *   is able to sort signed and floating-point types via simple bit-wise transformations
- *   that ensure lexicographic key ordering. For floating-point types -0.0 and +0.0 are
- *   considered equal and appear in the result in the same order as they appear in
- *   the input.
+ *   that ensure lexicographic key ordering. BlockRadixSort is stable. For
+ *   floating-point types -0.0 and +0.0 are considered equal and appear in the
+ *   result in the same order as they appear in the input.
  * - \rowmajor
  *
  * \par Performance Considerations
diff --git a/cub/device/device_radix_sort.cuh b/cub/device/device_radix_sort.cuh
index 45f14c614d..983ff0cf57 100644
--- a/cub/device/device_radix_sort.cuh
+++ b/cub/device/device_radix_sort.cuh
@@ -66,9 +66,9 @@ namespace cub {
  * half-precision floating-point type.  Although the direct radix sorting
  * method can only be applied to unsigned integral types, DeviceRadixSort
  * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering. DeviceRadixSort is stable. For floating-point types -0.0 and +0.0 are
- * considered equal and appear in the result in the same order as they appear in
- * the input.
+ * that ensure lexicographic key ordering. DeviceRadixSort is stable. For
+ * floating-point types -0.0 and +0.0 are considered equal and appear in the
+ * result in the same order as they appear in the input.
  *
  * \par Usage Considerations
  * \cdp_class{DeviceRadixSort}
diff --git a/cub/device/device_segmented_radix_sort.cuh b/cub/device/device_segmented_radix_sort.cuh
index b03a2bafb6..c5c7fe9304 100644
--- a/cub/device/device_segmented_radix_sort.cuh
+++ b/cub/device/device_segmented_radix_sort.cuh
@@ -66,9 +66,9 @@ namespace cub {
  * half-precision floating-point type.  Although the direct radix sorting
  * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
  * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering. For floating-point types -0.0 and +0.0 are
- * considered equal and appear in the result in the same order as they appear in
- * the input.
+ * that ensure lexicographic key ordering. DeviceSegmentedRadixSort is stable. For
+ * floating-point types -0.0 and +0.0 are considered equal and appear in the
+ * result in the same order as they appear in the input.
  *
  * \par Usage Considerations
  * \cdp_class{DeviceSegmentedRadixSort}

From 2e459727e7e798427e356daa0a8baa8e50bb0e50 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Thu, 20 May 2021 15:46:41 -0400
Subject: [PATCH 5/7] Document RadixSort twiddling and its impact on bit-range
 truncation.

For both Device and Block algorithms. Note that these do not handle
twiddling during descending sort in the same way.
---
 cub/block/block_radix_sort.cuh   | 64 +++++++++++++++++++++++---------
 cub/device/device_radix_sort.cuh | 43 +++++++++++++++++----
 2 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh
index 7d3bc93073..7919384ba4 100644
--- a/cub/block/block_radix_sort.cuh
+++ b/cub/block/block_radix_sort.cuh
@@ -64,23 +64,53 @@ namespace cub {
  * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
  *
  * \par Overview
- * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- *   items into ascending order.  It relies upon a positional representation for
- *   keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- *   characters, etc.) specified from least-significant to most-significant.  For a
- *   given input sequence of keys and a set of rules specifying a total ordering
- *   of the symbolic alphabet, the radix sorting method produces a lexicographic
- *   ordering of those keys.
- * - BlockRadixSort can sort all of the built-in C++ numeric primitive types
- *   (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
- *   half-precision floating-point type. Within each key, the implementation treats fixed-length
- *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
- *   method can only be applied to unsigned integral types, BlockRadixSort
- *   is able to sort signed and floating-point types via simple bit-wise transformations
- *   that ensure lexicographic key ordering. BlockRadixSort is stable. For
- *   floating-point types -0.0 and +0.0 are considered equal and appear in the
- *   result in the same order as they appear in the input.
- * - \rowmajor
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending order.  It relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \rowmajor
+ *
+ * \par Supported Types
+ * BlockRadixSort can sort all of the built-in C++ numeric primitive types
+ * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
+ * half-precision floating-point type.
+ *
+ * \par Bitwise Key Transformations
+ * Although the direct radix sorting method can only be applied to unsigned
+ * integral types, BlockRadixSort is able to sort signed and floating-point
+ * types via simple bit-wise transformations that ensure lexicographic key
+ * ordering.
+ *
+ * These transformations must be considered when restricting the
+ * `[begin_bit, end_bit)` range, as the bitwise transformations will occur
+ * before the bit-range truncation.
+ *
+ * Any transformations applied to the keys prior to sorting are reversed
+ * while writing to the final output buffer.
+ *
+ * \par Type Specific Bitwise Transformations
+ * To convert the input values into a radix-sortable bitwise representation,
+ * the following transformations take place prior to sorting:
+ *
+ * - For unsigned integral values, the keys are used directly.
+ * - For signed integral values, the sign bit is inverted.
+ * - For positive floating point values, the sign bit is inverted.
+ * - For negative floating point values, the full key is inverted.
+ *
+ * \par No Descending Sort Transformations
+ * Unlike `DeviceRadixSort`, `BlockRadixSort` does not invert the input key bits
+ * when performing a descending sort. Instead, it has special logic to reverse
+ * the order of the keys while sorting.
+ *
+ * \par Stability
+ * BlockRadixSort is stable. For floating-point types -0.0 and +0.0
+ * are considered equal and appear in the result in the same order as they
+ * appear in the input.
+ *
  *
  * \par Performance Considerations
  * - \granularity
diff --git a/cub/device/device_radix_sort.cuh b/cub/device/device_radix_sort.cuh
index 983ff0cf57..57971159c7 100644
--- a/cub/device/device_radix_sort.cuh
+++ b/cub/device/device_radix_sort.cuh
@@ -60,15 +60,44 @@ namespace cub {
  * of the symbolic alphabet, the radix sorting method produces a lexicographic
  * ordering of those keys.
  *
- * \par
+ * \par Supported Types
  * DeviceRadixSort can sort all of the built-in C++ numeric primitive types
  * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
- * half-precision floating-point type.  Although the direct radix sorting
- * method can only be applied to unsigned integral types, DeviceRadixSort
- * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering. DeviceRadixSort is stable. For
- * floating-point types -0.0 and +0.0 are considered equal and appear in the
- * result in the same order as they appear in the input.
+ * half-precision floating-point type.
+ *
+ * \par Transformations
+ * Although the direct radix sorting method can only be applied to unsigned
+ * integral types, DeviceRadixSort is able to sort signed and floating-point
+ * types via simple bit-wise transformations that ensure lexicographic key
+ * ordering. Additional transformations occur for descending sorts. These
+ * transformations must be considered when restricting the
+ * `[begin_bit, end_bit)` range, as the bitwise transformations will occur
+ * before the bit-range truncation.
+ *
+ * Any transformations applied to the keys prior to sorting are reversed
+ * while writing to the final output buffer.
+ *
+ * \par Type Specific Bitwise Transformations
+ * To convert the input values into a radix-sortable bitwise representation,
+ * the following transformations take place prior to sorting:
+ *
+ * - For unsigned integral values, the keys are used directly.
+ * - For signed integral values, the sign bit is inverted.
+ * - For positive floating point values, the sign bit is inverted.
+ * - For negative floating point values, the full key is inverted.
+ *
+ * For floating point types, positive and negative zero are a special case and
+ * will be considered equivalent during sorting.
+ *
+ * \par Descending Sort Bitwise Transformations
+ * If descending sort is used, the keys are inverted after performing any
+ * type-specific transformations, and the resulting keys are sorted in ascending
+ * order.
+ *
+ * \par Stability
+ * DeviceRadixSort is stable. For floating-point types, -0.0 and +0.0 are
+ * considered equal and appear in the result in the same order as they appear in
+ * the input.
  *
  * \par Usage Considerations
  * \cdp_class{DeviceRadixSort}

From d12cb62dbb2add413e0e345cf25b380236dea61a Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Thu, 20 May 2021 15:48:59 -0400
Subject: [PATCH 6/7] Document restrictions on DeviceRadixSort input ranges.

---
 cub/device/device_radix_sort.cuh | 102 +++++++++++++++++++++++++++----
 1 file changed, 89 insertions(+), 13 deletions(-)

diff --git a/cub/device/device_radix_sort.cuh b/cub/device/device_radix_sort.cuh
index 57971159c7..d9c4f3c723 100644
--- a/cub/device/device_radix_sort.cuh
+++ b/cub/device/device_radix_sort.cuh
@@ -122,7 +122,15 @@ struct DeviceRadixSort
      * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
      *
      * \par
-     * - The contents of the input data are not altered by the sorting operation
+     * - The contents of the input data are not altered by the sorting operation.
+     * - Pointers to contiguous memory must be used; iterators are not currently
+     *   supported.
+     * - In-place operations are not supported. There must be no overlap between
+     *   any of the provided ranges:
+     *   - `[d_keys_in,    d_keys_in    + num_items)`
+     *   - `[d_keys_out,   d_keys_out   + num_items)`
+     *   - `[d_values_in,  d_values_in  + num_items)`
+     *   - `[d_values_out, d_values_out + num_items)`
      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
      * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
      * - \devicestorage
@@ -191,6 +199,11 @@ struct DeviceRadixSort
         // Signed integer type for global offsets
         typedef int OffsetT;
 
+        // We cast away const-ness, but will *not* write to these arrays.
+        // `DispatchRadixSort::Dispatch` will allocate temporary storage and
+        // create a new double-buffer internally when the `is_overwrite_ok` flag
+        // is not set.
+        constexpr bool is_overwrite_okay = false;
         DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
         DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
 
@@ -202,7 +215,7 @@ struct DeviceRadixSort
             num_items,
             begin_bit,
             end_bit,
-            false,
+            is_overwrite_okay,
             stream,
             debug_synchronous);
     }
@@ -218,6 +231,12 @@ struct DeviceRadixSort
      *   contains the input data to be sorted).
      * - The contents of both buffers within each pair may be altered by the sorting
      *   operation.
+     * - In-place operations are not supported. There must be no overlap between
+     *   any of the provided ranges:
+     *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
+     *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
+     *   - `[d_values.Current(),   d_values.Current()   + num_items)`
+     *   - `[d_values.Alternate(), d_values.Alternate() + num_items)`
      * - Upon completion, the sorting operation will update the "current" indicator
      *   within each DoubleBuffer wrapper to reference which of the two buffers
      *   now contains the sorted output sequence (a function of the number of key bits
@@ -290,6 +309,8 @@ struct DeviceRadixSort
         // Signed integer type for global offsets
         typedef int OffsetT;
 
+        constexpr bool is_overwrite_okay = true;
+
         return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
             d_temp_storage,
             temp_storage_bytes,
@@ -298,7 +319,7 @@ struct DeviceRadixSort
             num_items,
             begin_bit,
             end_bit,
-            true,
+            is_overwrite_okay,
             stream,
             debug_synchronous);
     }
@@ -308,7 +329,15 @@ struct DeviceRadixSort
      * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
      *
      * \par
-     * - The contents of the input data are not altered by the sorting operation
+     * - The contents of the input data are not altered by the sorting operation.
+     * - Pointers to contiguous memory must be used; iterators are not currently
+     *   supported.
+     * - In-place operations are not supported. There must be no overlap between
+     *   any of the provided ranges:
+     *   - `[d_keys_in,    d_keys_in    + num_items)`
+     *   - `[d_keys_out,   d_keys_out   + num_items)`
+     *   - `[d_values_in,  d_values_in  + num_items)`
+     *   - `[d_values_out, d_values_out + num_items)`
      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
      * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
      * - \devicestorage
@@ -372,6 +401,11 @@ struct DeviceRadixSort
         // Signed integer type for global offsets
         typedef int OffsetT;
 
+        // We cast away const-ness, but will *not* write to these arrays.
+        // `DispatchRadixSort::Dispatch` will allocate temporary storage and
+        // create a new double-buffer internally when the `is_overwrite_ok` flag
+        // is not set.
+        constexpr bool is_overwrite_okay = false;
         DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
         DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
 
@@ -383,7 +417,7 @@ struct DeviceRadixSort
             num_items,
             begin_bit,
             end_bit,
-            false,
+            is_overwrite_okay,
             stream,
             debug_synchronous);
     }
@@ -399,6 +433,12 @@ struct DeviceRadixSort
      *   contains the input data to be sorted).
      * - The contents of both buffers within each pair may be altered by the sorting
      *   operation.
+     * - In-place operations are not supported. There must be no overlap between
+     *   any of the provided ranges:
+     *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
+     *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
+     *   - `[d_values.Current(),   d_values.Current()   + num_items)`
+     *   - `[d_values.Alternate(), d_values.Alternate() + num_items)`
      * - Upon completion, the sorting operation will update the "current" indicator
      *   within each DoubleBuffer wrapper to reference which of the two buffers
      *   now contains the sorted output sequence (a function of the number of key bits
@@ -466,6 +506,8 @@ struct DeviceRadixSort
         // Signed integer type for global offsets
         typedef int OffsetT;
 
+        constexpr bool is_overwrite_okay = true;
+
         return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
             d_temp_storage,
             temp_storage_bytes,
@@ -474,7 +516,7 @@ struct DeviceRadixSort
             num_items,
             begin_bit,
             end_bit,
-            true,
+            is_overwrite_okay,
             stream,
             debug_synchronous);
     }
@@ -491,7 +533,13 @@ struct DeviceRadixSort
      * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required)
      *
      * \par
-     * - The contents of the input data are not altered by the sorting operation
+     * - The contents of the input data are not altered by the sorting operation.
+     * - Pointers to contiguous memory must be used; iterators are not currently
+     *   supported.
+     * - In-place operations are not supported. There must be no overlap between
+     *   any of the provided ranges:
+     *   - `[d_keys_in,    d_keys_in    + num_items)`
+     *   - `[d_keys_out,   d_keys_out   + num_items)`
      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
      * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
      * - \devicestorage
@@ -548,8 +596,13 @@ struct DeviceRadixSort
         // Signed integer type for global offsets
         typedef int OffsetT;
 
-        // Null value type
+        // We cast away const-ness, but will *not* write to these arrays.
+        // `DispatchRadixSort::Dispatch` will allocate temporary storage and
+        // create a new double-buffer internally when the `is_overwrite_ok` flag
+        // is not set.
+        constexpr bool is_overwrite_okay = false;
         DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        // Null value type
         DoubleBuffer<NullType>  d_values;
 
         return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
@@ -560,7 +613,7 @@ struct DeviceRadixSort
             num_items,
             begin_bit,
             end_bit,
-            false,
+            is_overwrite_okay,
             stream,
             debug_synchronous);
     }
@@ -574,6 +627,10 @@ struct DeviceRadixSort
      *   DoubleBuffer structure that indicates which of the two buffers is
      *   "current" (and thus contains the input data to be sorted).
      * - The contents of both buffers may be altered by the sorting operation.
+     * - In-place operations are not supported. There must be no overlap between
+     *   any of the provided ranges:
+     *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
+     *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
      * - Upon completion, the sorting operation will update the "current" indicator
      *   within the DoubleBuffer wrapper to reference which of the two buffers
      *   now contains the sorted output sequence (a function of the number of key bits
@@ -636,6 +693,8 @@ struct DeviceRadixSort
         // Signed integer type for global offsets
         typedef int OffsetT;
 
+        constexpr bool is_overwrite_okay = true;
+
         // Null value type
         DoubleBuffer<NullType> d_values;
 
@@ -647,7 +706,7 @@ struct DeviceRadixSort
             num_items,
             begin_bit,
             end_bit,
-            true,
+            is_overwrite_okay,
             stream,
             debug_synchronous);
     }
@@ -656,7 +715,13 @@ struct DeviceRadixSort
      * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
      *
      * \par
-     * - The contents of the input data are not altered by the sorting operation
+     * - The contents of the input data are not altered by the sorting operation.
+     * - Pointers to contiguous memory must be used; iterators are not currently
+     *   supported.
+     * - In-place operations are not supported. There must be no overlap between
+     *   any of the provided ranges:
+     *   - `[d_keys_in,    d_keys_in    + num_items)`
+     *   - `[d_keys_out,   d_keys_out   + num_items)`
      * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
      * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
      * - \devicestorage
@@ -712,6 +777,11 @@ struct DeviceRadixSort
         // Signed integer type for global offsets
         typedef int OffsetT;
 
+        // We cast away const-ness, but will *not* write to these arrays.
+        // `DispatchRadixSort::Dispatch` will allocate temporary storage and
+        // create a new double-buffer internally when the `is_overwrite_ok` flag
+        // is not set.
+        constexpr bool is_overwrite_okay = false;
         DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
         DoubleBuffer<NullType>  d_values;
 
@@ -723,7 +793,7 @@ struct DeviceRadixSort
             num_items,
             begin_bit,
             end_bit,
-            false,
+            is_overwrite_okay,
             stream,
             debug_synchronous);
     }
@@ -737,6 +807,10 @@ struct DeviceRadixSort
      *   DoubleBuffer structure that indicates which of the two buffers is
      *   "current" (and thus contains the input data to be sorted).
      * - The contents of both buffers may be altered by the sorting operation.
+     * - In-place operations are not supported. There must be no overlap between
+     *   any of the provided ranges:
+     *   - `[d_keys.Current(),     d_keys.Current()     + num_items)`
+     *   - `[d_keys.Alternate(),   d_keys.Alternate()   + num_items)`
      * - Upon completion, the sorting operation will update the "current" indicator
      *   within the DoubleBuffer wrapper to reference which of the two buffers
      *   now contains the sorted output sequence (a function of the number of key bits
@@ -795,6 +869,8 @@ struct DeviceRadixSort
         // Signed integer type for global offsets
         typedef int OffsetT;
 
+        constexpr bool is_overwrite_okay = true;
+
         // Null value type
         DoubleBuffer<NullType> d_values;
 
@@ -806,7 +882,7 @@ struct DeviceRadixSort
             num_items,
             begin_bit,
             end_bit,
-            true,
+            is_overwrite_okay,
             stream,
             debug_synchronous);
     }

From e4c3bbf687e2992f67fa5e881b12430d8c7fe8c5 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Tue, 8 Jun 2021 12:01:29 -0400
Subject: [PATCH 7/7] Add "Floating Point Special Cases" section to RadixSort
 docs.

---
 cub/block/block_radix_sort.cuh             |  7 +++++++
 cub/device/device_radix_sort.cuh           |  7 +++++++
 cub/device/device_segmented_radix_sort.cuh | 12 +++---------
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/cub/block/block_radix_sort.cuh b/cub/block/block_radix_sort.cuh
index 7919384ba4..1fa7184c11 100644
--- a/cub/block/block_radix_sort.cuh
+++ b/cub/block/block_radix_sort.cuh
@@ -79,6 +79,13 @@ namespace cub {
  * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
  * half-precision floating-point type.
  *
+ * \par Floating-Point Special Cases
+ *
+ * - Positive and negative zeros are considered equivalent, and will be treated
+ *   as such in the output.
+ * - No special handling is implemented for NaN values; these are sorted
+ *   according to their bit representations after any transformations.
+ *
  * \par Bitwise Key Transformations
  * Although the direct radix sorting method can only be applied to unsigned
  * integral types, BlockRadixSort is able to sort signed and floating-point
diff --git a/cub/device/device_radix_sort.cuh b/cub/device/device_radix_sort.cuh
index d9c4f3c723..59bb90725b 100644
--- a/cub/device/device_radix_sort.cuh
+++ b/cub/device/device_radix_sort.cuh
@@ -65,6 +65,13 @@ namespace cub {
  * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
  * half-precision floating-point type.
  *
+ * \par Floating-Point Special Cases
+ *
+ * - Positive and negative zeros are considered equivalent, and will be treated
+ *   as such in the output.
+ * - No special handling is implemented for NaN values; these are sorted
+ *   according to their bit representations after any transformations.
+ *
  * \par Transformations
  * Although the direct radix sorting method can only be applied to unsigned
  * integral types, DeviceRadixSort is able to sort signed and floating-point
diff --git a/cub/device/device_segmented_radix_sort.cuh b/cub/device/device_segmented_radix_sort.cuh
index c5c7fe9304..d4edc9c01b 100644
--- a/cub/device/device_segmented_radix_sort.cuh
+++ b/cub/device/device_segmented_radix_sort.cuh
@@ -60,15 +60,9 @@ namespace cub {
  * of the symbolic alphabet, the radix sorting method produces a lexicographic
  * ordering of those keys.
  *
- * \par
- * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types
- * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
- * half-precision floating-point type.  Although the direct radix sorting
- * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
- * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering. DeviceSegmentedRadixSort is stable. For
- * floating-point types -0.0 and +0.0 are considered equal and appear in the
- * result in the same order as they appear in the input.
+ * \par See Also
+ * DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See
+ * that algorithm's documentation for more information.
  *
  * \par Usage Considerations
  * \cdp_class{DeviceSegmentedRadixSort}