Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge pull request #607 from senior-zero/enh-main/github/sort_128
Browse files Browse the repository at this point in the history
Support __{u,}int128_t in radix sort
  • Loading branch information
gevtushenko authored Jan 7, 2023
2 parents 423f54e + 475f96f commit 3abfcc1
Show file tree
Hide file tree
Showing 7 changed files with 200 additions and 9 deletions.
16 changes: 16 additions & 0 deletions cub/util_ptx.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,22 @@ __device__ __forceinline__ unsigned int BFE(
return (source >> bit_start) & MASK;
}

#if CUB_IS_INT128_ENABLED
/**
* Bitfield-extract for 128-bit types.
*/
template <typename UnsignedBits>
__device__ __forceinline__ unsigned int BFE(
UnsignedBits source,
unsigned int bit_start,
unsigned int num_bits,
Int2Type<16> /*byte_len*/)
{
const __uint128_t MASK = (__uint128_t{1} << num_bits) - 1;
return (source >> bit_start) & MASK;
}
#endif

#endif // DOXYGEN_SHOULD_SKIP_THIS

/**
Expand Down
90 changes: 90 additions & 0 deletions cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,28 @@

#include <cub/detail/uninitialized_copy.cuh>
#include <cub/util_arch.cuh>
#include <cub/util_compiler.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_macro.cuh>
#include <cub/util_namespace.cuh>

CUB_NAMESPACE_BEGIN

#ifndef CUB_IS_INT128_ENABLED
#if defined(__CUDACC_RTC__)
#if defined(__CUDACC_RTC_INT128__)
#define CUB_IS_INT128_ENABLED 1
#endif // !defined(__CUDACC_RTC_INT128__)
#else // !defined(__CUDACC_RTC__)
#if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11050)
#if (CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC) || \
(CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG) || \
defined(__ICC) || defined(_NVHPC_CUDA)
#define CUB_IS_INT128_ENABLED 1
#endif // GCC || CLANG || ICC || NVHPC
#endif // CTK >= 11.5
#endif // !defined(__CUDACC_RTC__)
#endif // !defined(CUB_IS_INT128_ENABLED)

/**
* \addtogroup UtilModule
Expand Down Expand Up @@ -1186,6 +1202,80 @@ template <> struct NumericTraits<unsigned int> : BaseTraits<UNSIGNED_INTE
template <> struct NumericTraits<unsigned long> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long, unsigned long> {};
template <> struct NumericTraits<unsigned long long> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long, unsigned long long> {};


#if CUB_IS_INT128_ENABLED
template <>
struct NumericTraits<__uint128_t>
{
using T = __uint128_t;
using UnsignedBits = __uint128_t;

static constexpr Category CATEGORY = UNSIGNED_INTEGER;
static constexpr UnsignedBits LOWEST_KEY = UnsignedBits(0);
static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1);

static constexpr bool PRIMITIVE = false;
static constexpr bool NULL_TYPE = false;

static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
{
return key;
}

static __host__ __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
{
return key;
}

static __host__ __device__ __forceinline__ T Max()
{
return MAX_KEY;
}

static __host__ __device__ __forceinline__ T Lowest()
{
return LOWEST_KEY;
}
};

template <>
struct NumericTraits<__int128_t>
{
using T = __int128_t;
using UnsignedBits = __uint128_t;

static constexpr Category CATEGORY = SIGNED_INTEGER;
static constexpr UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
static constexpr UnsignedBits LOWEST_KEY = HIGH_BIT;
static constexpr UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT;

static constexpr bool PRIMITIVE = false;
static constexpr bool NULL_TYPE = false;

static __host__ __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
{
return key ^ HIGH_BIT;
};

static __host__ __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
{
return key ^ HIGH_BIT;
};

static __host__ __device__ __forceinline__ T Max()
{
UnsignedBits retval = MAX_KEY;
return reinterpret_cast<T&>(retval);
}

static __host__ __device__ __forceinline__ T Lowest()
{
UnsignedBits retval = LOWEST_KEY;
return reinterpret_cast<T&>(retval);
}
};
#endif

template <> struct NumericTraits<float> : BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
template <> struct NumericTraits<double> : BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !_NVHPC_CUDA
Expand Down
36 changes: 36 additions & 0 deletions test/catch2_test_printing.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "test_util.h"

#include "catch2_test_helper.h"

template <typename T>
std::string print(T val)
{
std::stringstream ss;
ss << val;
return ss.str();
}

#if CUB_IS_INT128_ENABLED
TEST_CASE("Test utils can print __int128", "[test][utils]")
{
REQUIRE( print(__int128_t{0}) == "0" );
REQUIRE( print(__int128_t{42}) == "42" );
REQUIRE( print(__int128_t{-1}) == "-1" );
REQUIRE( print(__int128_t{-42}) == "-42" );
REQUIRE( print(__int128_t{-1} << 120) == "-1329227995784915872903807060280344576" );
}

TEST_CASE("Test utils can print __uint128", "[test][utils]")
{
REQUIRE( print(__uint128_t{0}) == "0" );
REQUIRE( print(__uint128_t{1}) == "1" );
REQUIRE( print(__uint128_t{42}) == "42" );
REQUIRE( print(__uint128_t{1} << 120) == "1329227995784915872903807060280344576" );
}
#endif

TEST_CASE("Test utils can print KeyValuePair", "[test][utils]")
{
REQUIRE( print(cub::KeyValuePair<int, int>{42, -42}) == "(42,-42)" );
}

6 changes: 3 additions & 3 deletions test/test_block_radix_rank.cu
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,9 @@ void TestDriver(GenMode gen_mode)
constexpr int tile_size = BlockThreads * ItemsPerThread;

// Allocate host arrays
std::unique_ptr<Key> h_keys(new Key[tile_size]);
std::unique_ptr<int> h_ranks(new int[tile_size]);
std::unique_ptr<int> h_reference_ranks(new int[tile_size]);
std::unique_ptr<Key[]> h_keys(new Key[tile_size]);
std::unique_ptr<int[]> h_ranks(new int[tile_size]);
std::unique_ptr<int[]> h_reference_ranks(new int[tile_size]);

// Allocate device arrays
Key *d_keys = nullptr;
Expand Down
6 changes: 3 additions & 3 deletions test/test_device_batch_memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -312,9 +312,9 @@ void RunTest(BufferOffsetT num_buffers,
using RandomInitAliasT = uint16_t;
std::size_t num_aliased_factor = sizeof(RandomInitAliasT) / sizeof(uint8_t);
std::size_t num_aliased_units = CUB_QUOTIENT_CEILING(num_total_bytes, num_aliased_factor);
std::unique_ptr<uint8_t> h_in(new uint8_t[num_aliased_units * num_aliased_factor]);
std::unique_ptr<uint8_t> h_out(new uint8_t[num_total_bytes]);
std::unique_ptr<uint8_t> h_gpu_results(new uint8_t[num_total_bytes]);
std::unique_ptr<uint8_t[]> h_in(new uint8_t[num_aliased_units * num_aliased_factor]);
std::unique_ptr<uint8_t[]> h_out(new uint8_t[num_total_bytes]);
std::unique_ptr<uint8_t[]> h_gpu_results(new uint8_t[num_total_bytes]);

// Generate random offsets into the random-bits data buffer
GenerateRandomData(reinterpret_cast<RandomInitAliasT *>(h_in.get()), num_aliased_units);
Expand Down
18 changes: 15 additions & 3 deletions test/test_device_radix_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -904,9 +904,11 @@ void InitializeSolution(
// Mask off unwanted portions
if (num_bits < static_cast<int>(sizeof(KeyT) * 8))
{
unsigned long long base = 0;
using UnsignedBits = typename cub::Traits<KeyT>::UnsignedBits;

UnsignedBits base = 0;
memcpy(&base, &h_keys[i], sizeof(KeyT));
base &= ((1ull << num_bits) - 1) << begin_bit;
base &= ((UnsignedBits{1} << num_bits) - 1) << begin_bit;
memcpy(&h_pairs[i].key, &base, sizeof(KeyT));
}
else
Expand Down Expand Up @@ -1929,7 +1931,7 @@ int main(int argc, char** argv)
CubDebugExit(args.DeviceInit());

// %PARAM% TEST_CDP cdp 0:1
// %PARAM% TEST_KEY_BYTES bytes 1:2:4:8
// %PARAM% TEST_KEY_BYTES bytes 1:2:4:8:16
// %PARAM% TEST_VALUE_TYPE pairs 0:1:2:3
// 0->Keys only
// 1->uchar
Expand Down Expand Up @@ -1995,6 +1997,16 @@ int main(int argc, char** argv)
TestGen<unsigned long long, false>(num_items, num_segments);
#endif // TEST_EXTENDED_KEY_TYPES

#elif TEST_KEY_BYTES == 16

#if CUB_IS_INT128_ENABLED
TestGen<__int128_t, false>(num_items, num_segments);
TestGen<__uint128_t, false>(num_items, num_segments);
#else
// Fix unused static function for MSVC
BackendToString(CUB);
#endif

#endif // TEST_KEY_BYTES switch

return 0;
Expand Down
37 changes: 37 additions & 0 deletions test/test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,43 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
return os;
}

#if CUB_IS_INT128_ENABLED
static std::ostream& operator<<(std::ostream& os, __uint128_t val)
{
constexpr int max_digits = 40;
char buffer[max_digits] = {};
char* digit = buffer + max_digits;
const char* ascii = "0123456789";

do
{
digit--;
*digit = ascii[val % 10];
val /= 10;
}
while(val != 0);

for (; digit != buffer + max_digits; digit++) {
os << *digit;
}

return os;
}

static std::ostream& operator<<(std::ostream& os, __int128_t val)
{
if (val < 0) {
__uint128_t tmp = -val;
os << '-' << tmp;
} else {
__uint128_t tmp = val;
os << tmp;
}

return os;
}
#endif


/******************************************************************************
* Comparison and ostream operators for CUDA vector types
Expand Down

0 comments on commit 3abfcc1

Please sign in to comment.