diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_Sorting.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_Sorting.hpp new file mode 100644 index 000000000000..a43d5573dd86 --- /dev/null +++ b/packages/kokkos-kernels/src/common/KokkosKernels_Sorting.hpp @@ -0,0 +1,572 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSKERNELS_SORTING_HPP +#define _KOKKOSKERNELS_SORTING_HPP +#include "Kokkos_Core.hpp" +#include "Kokkos_Atomic.hpp" +#include "Kokkos_ArithTraits.hpp" +#include "impl/Kokkos_Timer.hpp" +#include + +namespace KokkosKernels { +namespace Impl { + +//Radix sort for integers, on a single thread within a team. +//Pros: few diverging branches, so OK for sorting on a single GPU vector lane. Better on CPU cores. +//Con: requires auxiliary storage, and this version only works for integers +template +KOKKOS_INLINE_FUNCTION void +SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) +{ + static_assert(std::is_integral::value && std::is_unsigned::value, + "radixSort can only be run on unsigned integers."); + if(n <= 1) + return; + ValueType maxVal = 0; + for(Ordinal i = 0; i < n; i++) + { + if(maxVal < values[i]) + maxVal = values[i]; + } + //determine how many significant bits the data has + int passes = 0; + while(maxVal) + { + maxVal >>= 4; + passes++; + } + //Is the data currently held in values (false) or valuesAux (true)? + bool inAux = false; + //sort 4 bits at a time, into 16 buckets + ValueType mask = 0xF; + //maskPos counts the low bit index of mask (0, 4, 8, ...) 
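The pass structure above (histogram into 16 buckets, exclusive prefix sum, stable scatter, ping-pong between the two buffers) is easier to follow in a host-only form. The sketch below is illustrative only: it uses std::vector, shifts by 4*p instead of carrying mask/maskPos, and scatters with an incrementing offset rather than the decrementing count used above. The function body continues below with the maskPos declaration.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Host-only sketch of the same 4-bit LSD radix pass structure (illustrative).
    void radixSortSketch(std::vector<uint32_t>& v)
    {
      if(v.size() <= 1)
        return;
      std::vector<uint32_t> aux(v.size());
      uint32_t maxVal = 0;
      for(uint32_t x : v)
        if(x > maxVal) maxVal = x;
      int passes = 0;
      while(maxVal) { maxVal >>= 4; passes++; }
      for(int p = 0; p < passes; p++)
      {
        size_t count[16] = {0};
        size_t offset[17] = {0};
        for(uint32_t x : v)
          count[(x >> (4 * p)) & 0xF]++;                 // histogram of the p-th hex digit
        for(int b = 0; b < 16; b++)
          offset[b + 1] = offset[b] + count[b];          // exclusive prefix sum
        for(uint32_t x : v)
          aux[offset[(x >> (4 * p)) & 0xF]++] = x;       // stable scatter into the other buffer
        v.swap(aux);                                     // ping-pong, like the inAux flag above
      }
    }

    int main()
    {
      std::vector<uint32_t> v{42, 7, 100000, 7, 3};
      radixSortSketch(v);
      for(uint32_t x : v)
        std::printf("%u ", x);                           // 3 7 7 42 100000
      std::printf("\n");
    }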
+ Ordinal maskPos = 0; + for(int p = 0; p < passes; p++) + { + //Count the number of elements in each bucket + Ordinal count[16] = {0}; + Ordinal offset[17]; + if(!inAux) + { + for(Ordinal i = 0; i < n; i++) + { + count[(values[i] & mask) >> maskPos]++; + } + } + else + { + for(Ordinal i = 0; i < n; i++) + { + count[(valuesAux[i] & mask) >> maskPos]++; + } + } + offset[0] = 0; + //get offset as the prefix sum for count + for(Ordinal i = 0; i < 16; i++) + { + offset[i + 1] = offset[i] + count[i]; + } + //now for each element in [lo, hi), move it to its offset in the other buffer + //this branch should be ok because whichBuf is the same on all threads + if(!inAux) + { + for(Ordinal i = 0; i < n; i++) + { + Ordinal bucket = (values[i] & mask) >> maskPos; + valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; + count[bucket]--; + } + } + else + { + for(Ordinal i = 0; i < n; i++) + { + Ordinal bucket = (valuesAux[i] & mask) >> maskPos; + values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; + count[bucket]--; + } + } + inAux = !inAux; + mask = mask << 4; + maskPos += 4; + } + //Move values back into main array if they are currently in aux. + //This is the case if an odd number of rounds were done. + if(inAux) + { + for(Ordinal i = 0; i < n; i++) + { + values[i] = valuesAux[i]; + } + } +} + +//Radix sort for integers (no internal parallelism). +//While sorting, also permute "perm" array along with the values. +//Pros: few diverging branches, so good for sorting on a single GPU vector lane. +//Con: requires auxiliary storage, this version only works for integers (although float/double is possible) +template +KOKKOS_INLINE_FUNCTION void +SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n) +{ + static_assert(std::is_integral::value && std::is_unsigned::value, + "radixSort can only be run on unsigned integers."); + if(n <= 1) + return; + ValueType maxVal = 0; + for(Ordinal i = 0; i < n; i++) + { + if(maxVal < values[i]) + maxVal = values[i]; + } + int passes = 0; + while(maxVal) + { + maxVal >>= 4; + passes++; + } + //Is the data currently held in values (false) or valuesAux (true)? + bool inAux = false; + //sort 4 bits at a time, into 16 buckets + ValueType mask = 0xF; + //maskPos counts the low bit index of mask (0, 4, 8, ...) 
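SerialRadixSort2 sorts the keys and applies the identical permutation to a companion array (for example, CRS column indices alongside their values). Because it is a KOKKOS_INLINE_FUNCTION with no internal parallelism, it can also be exercised serially on the host, which makes for a compact usage sketch. The sketch below is hedged: the include path is an assumption about the build, the data is arbitrary, and all template arguments are left to deduction so the exact parameter order does not matter. The pass loop of SerialRadixSort2 continues below.

    #include <Kokkos_Core.hpp>
    #include "KokkosKernels_Sorting.hpp"   // include path is an assumption; adjust to your build
    #include <cstdio>

    int main(int argc, char** argv)
    {
      Kokkos::initialize(argc, argv);
      {
        unsigned keys[5]    = {40, 10, 30, 50, 20};
        unsigned keysAux[5] = {0};
        int      vals[5]    = {4, 1, 3, 5, 2};
        int      valsAux[5] = {0};
        // Sort keys ascending and drag vals along with the same permutation.
        KokkosKernels::Impl::SerialRadixSort2(keys, keysAux, vals, valsAux, 5);
        for(int i = 0; i < 5; i++)
          std::printf("%u:%d ", keys[i], vals[i]);       // 10:1 20:2 30:3 40:4 50:5
        std::printf("\n");
      }
      Kokkos::finalize();
    }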
+ Ordinal maskPos = 0; + for(int p = 0; p < passes; p++) + { + //Count the number of elements in each bucket + Ordinal count[16] = {0}; + Ordinal offset[17]; + if(!inAux) + { + for(Ordinal i = 0; i < n; i++) + { + count[(values[i] & mask) >> maskPos]++; + } + } + else + { + for(Ordinal i = 0; i < n; i++) + { + count[(valuesAux[i] & mask) >> maskPos]++; + } + } + offset[0] = 0; + //get offset as the prefix sum for count + for(Ordinal i = 0; i < 16; i++) + { + offset[i + 1] = offset[i] + count[i]; + } + //now for each element in [lo, hi), move it to its offset in the other buffer + //this branch should be ok because whichBuf is the same on all threads + if(!inAux) + { + for(Ordinal i = 0; i < n; i++) + { + Ordinal bucket = (values[i] & mask) >> maskPos; + valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; + permAux[offset[bucket + 1] - count[bucket]] = perm[i]; + count[bucket]--; + } + } + else + { + for(Ordinal i = 0; i < n; i++) + { + Ordinal bucket = (valuesAux[i] & mask) >> maskPos; + values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; + perm[offset[bucket + 1] - count[bucket]] = permAux[i]; + count[bucket]--; + } + } + inAux = !inAux; + mask = mask << 4; + maskPos += 4; + } + //Move values back into main array if they are currently in aux. + //This is the case if an odd number of rounds were done. + if(inAux) + { + for(Ordinal i = 0; i < n; i++) + { + values[i] = valuesAux[i]; + perm[i] = permAux[i]; + } + } +} + +template +struct DefaultComparator +{ + KOKKOS_INLINE_FUNCTION bool operator()(const Value lhs, const Value rhs) const + { + return lhs < rhs; + } +}; + +//Bitonic merge sort (requires only comparison operators and trivially-copyable) +//Pros: In-place, plenty of parallelism for GPUs, and memory references are coalesced +//Con: O(n log^2(n)) serial time is bad on CPUs +//Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter +template> +KOKKOS_INLINE_FUNCTION void +TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) +{ + //Algorithm only works on power-of-two input size only. + //If n is not a power-of-two, will implicitly pretend + //that values[i] for i >= n is just the max for ValueType, so it never gets swapped + Ordinal npot = 1; + Ordinal levels = 0; + while(npot < n) + { + levels++; + npot <<= 1; + } + for(Ordinal i = 0; i < levels; i++) + { + for(Ordinal j = 0; j <= i; j++) + { + // n/2 pairs of items are compared in parallel + Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), + [=](const Ordinal t) + { + //How big are the brown/pink boxes? + Ordinal boxSize = Ordinal(2) << (i - j); + //Which box contains this thread? 
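The box arithmetic in this kernel (and the identical arithmetic in TeamBitonicSort2 below) is easier to check in a serial restatement. The sketch below replays the same network on the host, with the TeamVectorRange over t replaced by a plain loop; it is illustration only, and like the team version it handles a non-power-of-two n by skipping comparisons whose partner index falls past the end.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Host-only restatement of the bitonic network above (illustrative).
    void bitonicSketch(std::vector<int>& v)
    {
      const int n = (int) v.size();
      int npot = 1, levels = 0;
      while(npot < n) { levels++; npot <<= 1; }
      for(int i = 0; i < levels; i++)
        for(int j = 0; j <= i; j++)
          for(int t = 0; t < npot / 2; t++)              // the team does these in parallel
          {
            const int boxSize   = 2 << (i - j);
            const int boxID     = t >> (i - j);
            const int boxStart  = boxID << (1 + i - j);
            const int boxOffset = t - (boxStart >> 1);
            const int elem1     = boxStart + boxOffset;
            // j == 0 compares mirrored elements of the box; j > 0 compares at stride boxSize/2.
            const int elem2 = (j == 0) ? boxStart + boxSize - 1 - boxOffset
                                       : elem1 + boxSize / 2;
            if(elem2 < n && v[elem2] < v[elem1])
              std::swap(v[elem1], v[elem2]);
          }
    }

    int main()
    {
      std::vector<int> v{9, 1, 7, 3, 8, 2, 5};           // n = 7, not a power of two
      bitonicSketch(v);
      for(int x : v)
        std::printf("%d ", x);                           // 1 2 3 5 7 8 9
      std::printf("\n");
    }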
+ Ordinal boxID = t >> (i - j); //t * 2 / boxSize; + Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize + Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; + Ordinal elem1 = boxStart + boxOffset; + Comparator comp; + if(j == 0) + { + //first phase (brown box): within a block, compare with the opposite value in the box + Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; + if(elem2 < n) + { + //both elements in bounds, so compare them and swap if out of order + if(comp(values[elem2], values[elem1])) + { + ValueType temp = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp; + } + } + } + else + { + //later phases (pink box): within a block, compare with fixed distance (boxSize / 2) apart + Ordinal elem2 = elem1 + boxSize / 2; + if(elem2 < n) + { + if(comp(values[elem2], values[elem1])) + { + ValueType temp = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp; + } + } + } + }); + mem.team_barrier(); + } + } +} + +//Sort "values", while applying the same swaps to "perm" +template> +KOKKOS_INLINE_FUNCTION void +TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem) +{ + //Algorithm only works on power-of-two input size only. + //If n is not a power-of-two, will implicitly pretend + //that values[i] for i >= n is just the max for ValueType, so it never gets swapped + Ordinal npot = 1; + Ordinal levels = 0; + while(npot < n) + { + levels++; + npot <<= 1; + } + for(Ordinal i = 0; i < levels; i++) + { + for(Ordinal j = 0; j <= i; j++) + { + // n/2 pairs of items are compared in parallel + Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), + [=](const Ordinal t) + { + //How big are the brown/pink boxes? + Ordinal boxSize = Ordinal(2) << (i - j); + //Which box contains this thread? 
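TeamBitonicSort2 performs the same network but applies every swap to a companion array as well, which is the usual pattern for sorting CRS column indices together with their values. A hedged usage sketch follows: one team per fixed-length row, with the view names, row length, and include path chosen only for illustration; template arguments are left to deduction. The kernel body continues below.

    #include <Kokkos_Core.hpp>
    #include "KokkosKernels_Sorting.hpp"   // include path is an assumption; adjust to your build

    int main(int argc, char** argv)
    {
      Kokkos::initialize(argc, argv);
      {
        using ExecSpace = Kokkos::DefaultExecutionSpace;
        using TeamPol   = Kokkos::TeamPolicy<ExecSpace>;
        const int numRows = 100, rowLen = 64;
        Kokkos::View<int*>    keys("keys", numRows * rowLen);
        Kokkos::View<double*> vals("vals", numRows * rowLen);
        Kokkos::parallel_for("Fill", Kokkos::RangePolicy<ExecSpace>(0, numRows * rowLen),
          KOKKOS_LAMBDA(const int i)
          {
            keys(i) = rowLen - (i % rowLen);             // every row starts reverse-sorted
            vals(i) = 0.5 * keys(i);                     // value that must follow its key
          });
        // One team sorts one row, permuting vals exactly as keys are permuted.
        Kokkos::parallel_for("SortRows", TeamPol(numRows, Kokkos::AUTO()),
          KOKKOS_LAMBDA(const TeamPol::member_type& t)
          {
            const int row = t.league_rank();
            KokkosKernels::Impl::TeamBitonicSort2(keys.data() + row * rowLen,
                                                  vals.data() + row * rowLen, rowLen, t);
          });
      }
      Kokkos::finalize();
    }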
+ Ordinal boxID = t >> (i - j); //t * 2 / boxSize; + Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize + Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; + Ordinal elem1 = boxStart + boxOffset; + Comparator comp; + if(j == 0) + { + //first phase (brown box): within a block, compare with the opposite value in the box + Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; + if(elem2 < n) + { + //both elements in bounds, so compare them and swap if out of order + if(comp(values[elem2], values[elem1])) + { + ValueType temp1 = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp1; + PermType temp2 = perm[elem1]; + perm[elem1] = perm[elem2]; + perm[elem2] = temp2; + } + } + } + else + { + //later phases (pink box): within a block, compare with fixed distance (boxSize / 2) apart + Ordinal elem2 = elem1 + boxSize / 2; + if(elem2 < n) + { + if(comp(values[elem2], values[elem1])) + { + ValueType temp1 = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp1; + PermType temp2 = perm[elem1]; + perm[elem1] = perm[elem2]; + perm[elem2] = temp2; + } + } + } + }); + mem.team_barrier(); + } + } +} + +//Functor that sorts a view on one team +template +struct BitonicSingleTeamFunctor +{ + BitonicSingleTeamFunctor(View& v_) : v(v_) {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + TeamBitonicSort(v.data(), v.extent(0), t); + }; + View v; +}; + +//Functor that sorts equally sized chunks on each team +template +struct BitonicChunkFunctor +{ + BitonicChunkFunctor(View& v_, Ordinal chunkSize_) : v(v_), chunkSize(chunkSize_) {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal chunk = t.league_rank(); + Ordinal chunkStart = chunk * chunkSize; + Ordinal n = chunkSize; + if(chunkStart + n > Ordinal(v.extent(0))) + n = v.extent(0) - chunkStart; + TeamBitonicSort(v.data() + chunkStart, n, t); + }; + View v; + Ordinal chunkSize; +}; + +//Functor that does just the first phase (brown) of bitonic sort on equally-sized chunks +template +struct BitonicPhase1Functor +{ + typedef typename View::value_type Value; + BitonicPhase1Functor(View& v_, Ordinal boxSize_, Ordinal teamsPerBox_) + : v(v_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal box = t.league_rank() / teamsPerBox; + Ordinal boxStart = boxSize * box; + Ordinal work = boxSize / teamsPerBox / 2; + Ordinal workStart = work * (t.league_rank() % teamsPerBox); + Ordinal workReflect = boxSize - workStart - 1; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [=](const Ordinal i) + { + Comparator comp; + Ordinal elem1 = boxStart + workStart + i; + Ordinal elem2 = boxStart + workReflect - i; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + }; + View v; + Ordinal boxSize; + Ordinal teamsPerBox; +}; + +//Functor that does the second phase (red) of bitonic sort +template +struct BitonicPhase2Functor +{ + typedef typename View::value_type Value; + BitonicPhase2Functor(View& v_, Ordinal boxSize_, Ordinal teamsPerBox_) + : v(v_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal logBoxSize = 1; + while((Ordinal(1) << logBoxSize) < boxSize) + logBoxSize++; + Ordinal box = t.league_rank() / teamsPerBox; + Ordinal boxStart = boxSize * box; + Ordinal work = boxSize / teamsPerBox / 2; + 
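For orientation, these two phase functors are driven by the bitonicSort() host routine further down in this file, which first sorts fixed-size chunks within teams and then sweeps phase 1 and phase 2 over progressively smaller boxes. The standalone snippet below only prints the launch schedule that driver would issue for a hypothetical npot = 4096 with its default chunkSize of 512; the loop bounds are copied from the driver and the concrete numbers are an illustration. The functor body resumes below.

    #include <cstdio>

    int main()
    {
      const int npot = 4096, chunkSize = 512;            // chunkSize matches the driver's default
      std::printf("sort %d chunks of %d, one chunk per team\n", npot / chunkSize, chunkSize);
      for(int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2)
      {
        const int boxSize = teamsPerBox * chunkSize;
        std::printf("phase 1: boxSize %4d, teamsPerBox %d\n", boxSize, teamsPerBox);
        for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++)
          std::printf("phase 2: boxSize %4d, teamsPerBox %d\n",
                      boxSize >> boxDiv, teamsPerBox >> boxDiv);
      }
    }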
Ordinal workStart = boxStart + work * (t.league_rank() % teamsPerBox); + Ordinal jump = boxSize / 2; + Comparator comp; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [=](const Ordinal i) + { + Ordinal elem1 = workStart + i; + Ordinal elem2 = workStart + jump + i; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + if(teamsPerBox == 1) + { + //This team can finish phase 2 for all the smaller red boxes that follow, + //since there are no longer cross-team data dependencies + for(Ordinal subLevel = 1; subLevel < logBoxSize; subLevel++) + { + t.team_barrier(); + Ordinal logSubBoxSize = logBoxSize - subLevel; + Ordinal subBoxSize = Ordinal(1) << logSubBoxSize; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [=](const Ordinal i) + { + Ordinal globalThread = i + t.league_rank() * work; + Ordinal subBox = globalThread >> (logSubBoxSize - 1); + Ordinal subBoxStart = subBox << logSubBoxSize; + Ordinal subBoxOffset = globalThread & ((Ordinal(1) << (logSubBoxSize - 1)) - 1); //i % (subBoxSize / 2) + Ordinal elem1 = subBoxStart + subBoxOffset; + //later phases (pink box): within a block, compare with fixed distance (boxSize / 2) apart + Ordinal elem2 = elem1 + subBoxSize / 2; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + } + } + }; + View v; + Ordinal boxSize; + Ordinal teamsPerBox; +}; + +//Version to be called from host on a single array +//Generally ~2x slower than Kokkos::sort() for large arrays (> 50 M elements), +//but faster for smaller arrays. +// +//This is more general than BinSort: bitonic supports any trivially copyable type +//and an arbitrary device-compatible comparison operator (provided through operator() of Comparator) +//If comparator is void, use operator< (which should only be used for primitives) +template> +void bitonicSort(View v) +{ + typedef Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + Ordinal n = v.extent(0); + //If n is small, just sort on a single team + if(n <= Ordinal(1) << 16) + { + Kokkos::parallel_for(team_policy(1, Kokkos::AUTO()), + BitonicSingleTeamFunctor(v)); + } + else + { + Ordinal npot = 1; + while(npot < n) + npot <<= 1; + //Partition the data equally among fixed number of teams + Ordinal chunkSize = 512; + Ordinal numTeams = npot / chunkSize; + //First, sort within teams + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + BitonicChunkFunctor(v, chunkSize)); + for(int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) + { + Ordinal boxSize = teamsPerBox * chunkSize; + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + BitonicPhase1Functor(v, boxSize, teamsPerBox)); + for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) + { + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + BitonicPhase2Functor(v, boxSize >> boxDiv, teamsPerBox >> boxDiv)); + } + } + } +} + +}} + +#endif + diff --git a/packages/kokkos-kernels/test_common/Test_Common_Sorting.hpp b/packages/kokkos-kernels/test_common/Test_Common_Sorting.hpp new file mode 100644 index 000000000000..a0b1f78225e9 --- /dev/null +++ b/packages/kokkos-kernels/test_common/Test_Common_Sorting.hpp @@ -0,0 +1,603 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia 
Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Test_Common_Sorting.hpp +/// \brief Tests for radixSort and bitonicSort in KokkoKernels_Sorting.hpp + +#ifndef KOKKOSKERNELS_SORTINGTEST_HPP +#define KOKKOSKERNELS_SORTINGTEST_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +//Generate n randomized counts with mean . +//Then prefix-sum into randomOffsets. +//This simulates a CRS rowmap or other batched sorting scenario +template +size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, size_t n, size_t avg) +{ + srand(54321); + auto countsHost = Kokkos::create_mirror_view(randomCounts); + size_t total = 0; + for(size_t i = 0; i < n; i++) + { + if(avg == 0) + countsHost(i) = 0; + else + countsHost(i) = 0.5 + rand() % (avg * 2); + total += countsHost(i); + } + Kokkos::deep_copy(randomCounts, countsHost); + Kokkos::deep_copy(randomOffsets, randomCounts); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(n, randomOffsets); + return total; +} + +struct Coordinates +{ + KOKKOS_INLINE_FUNCTION Coordinates() + { + x = 0; + y = 0; + z = 0; + } + KOKKOS_INLINE_FUNCTION Coordinates(double x_, double y_, double z_) + { + x = x_; + y = y_; + z = z_; + } + double x; + double y; + double z; +}; + +/* +getRandom generates a random object for sorting. 
+the general version just produces integers - below +are some specializations +*/ + +template +T getRandom() +{ + return rand() % Kokkos::ArithTraits::max(); +} + +//Generate a uniform double between (-5, 5) +template<> +double getRandom() +{ + return -5 + (10.0 * rand()) / RAND_MAX; +} + +template<> +Coordinates getRandom() +{ + return Coordinates(getRandom(), getRandom(), getRandom()); +} + +//Specialize for Kokkos::complex, with the real and imaginary parts different +template +struct kvHash +{ + Value operator()(const Key& k) + { + return (Value) (3 * k + 4); + } +}; + +template +struct kvHash> +{ + Kokkos::complex operator()(const Key& k) + { + return Kokkos::complex(3 * k + 4, k - 10.4); + } +}; + +template +void fillRandom(View v) +{ + srand(12345); + typedef typename View::value_type Value; + auto vhost = Kokkos::create_mirror_view(v); + for(size_t i = 0; i < v.extent(0); i++) + vhost(i) = getRandom(); + Kokkos::deep_copy(v, vhost); +} + +template +void fillRandom(KeyView k, ValView v) +{ + srand(23456); + typedef typename KeyView::value_type Key; + typedef typename ValView::value_type Value; + auto khost = Kokkos::create_mirror_view(k); + auto vhost = Kokkos::create_mirror_view(v); + for(size_t i = 0; i < v.extent(0); i++) + { + khost(i) = getRandom(); + vhost(i) = kvHash()(khost(i)); + } + Kokkos::deep_copy(k, khost); + Kokkos::deep_copy(v, vhost); +} + +template +struct TestSerialRadixFunctor +{ + using Key = typename KeyView::value_type; + using UnsignedKey = typename std::make_unsigned::type; + + TestSerialRadixFunctor(KeyView& keys_, KeyView& keysAux_, OrdView& counts_, OrdView& offsets_) + : keys(keys_), keysAux(keysAux_), counts(counts_), offsets(offsets_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const int i) const + { + int off = offsets(i); + KokkosKernels::Impl::SerialRadixSort( + (UnsignedKey*) keys.data() + off, (UnsignedKey*) keysAux.data() + off, counts(i)); + } + KeyView keys; + KeyView keysAux; + OrdView counts; + OrdView offsets; +}; + +template +struct TestSerialRadix2Functor +{ + //Sort by keys, while permuting values + using Key = typename KeyView::value_type; + using UnsignedKey = typename std::make_unsigned::type; + using Value = typename ValView::value_type; + + TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) + : keys(keys_), keysAux(keysAux_), values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const int i) const + { + int off = offsets(i); + KokkosKernels::Impl::SerialRadixSort2( + (UnsignedKey*) keys.data() + off, (UnsignedKey*) keysAux.data() + off, + values.data() + off, valuesAux.data() + off, counts(i)); + } + KeyView keys; + KeyView keysAux; + ValView values; + ValView valuesAux; + OrdView counts; + OrdView offsets; +}; + +template +void testSerialRadixSort(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View KeyView; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + KeyView keys("Radix sort testing data", n); + fillRandom(keys); + //Sort 
using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); + for(size_t i = 0; i < k; i++) + { + Key* begin = gold.data() + offsetsHost(i); + Key* end = begin + countsHost(i); + std::sort(begin, end); + } + KeyView keysAux("Radix sort aux data", n); + //Run the sorting on device in all sub-arrays in parallel + typedef Kokkos::RangePolicy range_policy; + Kokkos::parallel_for(range_policy(0, k), + TestSerialRadixFunctor(keys, keysAux, counts, offsets)); + //Copy result to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(keysHost(i), gold(i)); + } +} + +template +void testSerialRadixSort2(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View KeyView; + typedef Kokkos::View ValView; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + KeyView keys("Radix test keys", n); + ValView data("Radix test data", n); + //The keys are randomized + fillRandom(keys, data); + KeyView keysAux("Radix sort aux keys", n); + ValView dataAux("Radix sort aux data", n); + //Run the sorting on device in all sub-arrays in parallel + typedef Kokkos::RangePolicy range_policy; + //Deliberately using a weird number for vector length + Kokkos::parallel_for(range_policy(0, k), + TestSerialRadix2Functor(keys, keysAux, data, dataAux, counts, offsets)); + //Sort using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); + for(size_t i = 0; i < k; i++) + { + Key* begin = gold.data() + offsetsHost(i); + Key* end = begin + countsHost(i); + std::sort(begin, end); + } + //Copy results to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + //Make sure keys are sorted exactly (stability of sort doesn't matter) + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(keysHost(i), gold(i)); + } + //Make sure the hashes of each key still matches the corresponding value + for(size_t i = 0; i < n; i++) + { + auto correctHash = kvHash()(keysHost(i)); + ASSERT_EQ(dataHost(i), correctHash); + } +} + +template +struct TestTeamBitonicFunctor +{ + typedef typename ValView::value_type Value; + + TestTeamBitonicFunctor(ValView& values_, OrdView& counts_, OrdView& offsets_) + : values(values_), counts(counts_), offsets(offsets_) + {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + { + int i = t.league_rank(); + KokkosKernels::Impl::TeamBitonicSort(values.data() + offsets(i), counts(i), t); + } + + ValView values; + OrdView counts; + OrdView offsets; +}; + +template +struct TestTeamBitonic2Functor +{ + typedef typename KeyView::value_type Key; + typedef typename ValView::value_type Value; + + TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_) + : keys(keys_), values(values_), counts(counts_), offsets(offsets_) + {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + { + int i = 
t.league_rank(); + KokkosKernels::Impl::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), counts(i), t); + } + + KeyView keys; + ValView values; + OrdView counts; + OrdView offsets; +}; + +template +void testTeamBitonicSort(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View ValView; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + ValView data("Bitonic sort testing data", n); + fillRandom(data); + //Run the sorting on device in all sub-arrays in parallel + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + TestTeamBitonicFunctor(data, counts, offsets)); + //Copy result to host + auto dataHost = Kokkos::create_mirror_view(data); + Kokkos::deep_copy(dataHost, data); + //Sort using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, data); + for(size_t i = 0; i < k; i++) + { + Scalar* begin = gold.data() + offsetsHost(i); + Scalar* end = begin + countsHost(i); + std::sort(begin, end); + } + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(dataHost(i), gold(i)); + } +} + +template +void testTeamBitonicSort2(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View KeyView; + typedef Kokkos::View ValView; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + KeyView keys("Bitonic test keys", n); + ValView data("Bitonic test data", n); + //The keys are randomized + fillRandom(keys, data); + //Run the sorting on device in all sub-arrays in parallel, just using vector loops + //Deliberately using a weird number for vector length + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + TestTeamBitonic2Functor(keys, data, counts, offsets)); + //Sort using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); + for(size_t i = 0; i < k; i++) + { + Key* begin = gold.data() + offsetsHost(i); + Key* end = begin + countsHost(i); + std::sort(begin, end); + } + //Copy results to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + //Make sure keys are sorted exactly (stability of sort doesn't matter) + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(keysHost(i), gold(i)); + } + //Make sure the hashes of each key still matches the corresponding value + for(size_t i = 0; i < n; i++) + { + auto correctHash = kvHash()(keysHost(i)); + ASSERT_EQ(dataHost(i), correctHash); + } +} + +template +struct CheckSortedFunctor +{ + CheckSortedFunctor(View& v_) + : v(v_) {} + KOKKOS_INLINE_FUNCTION void operator()(int i, int& lval) const + { + if(v(i) > v(i + 1)) + 
lval = 0; + } + View v; +}; + +template +void testBitonicSort(size_t n) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View ValView; + ValView data("Bitonic sort testing data", n); + fillRandom(data); + KokkosKernels::Impl::bitonicSort(data); + int ordered = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + CheckSortedFunctor(data), Kokkos::Min(ordered)); + ASSERT_TRUE(ordered); +} + +//Check that the view is weakly ordered according to Comparator: +//Comparator never says that element i+1 belongs before element i. +template +struct CheckOrderedFunctor +{ + CheckOrderedFunctor(View& v_) + : v(v_) {} + KOKKOS_INLINE_FUNCTION void operator()(int i, int& lval) const + { + Comparator comp; + if(comp(v(i + 1), v(i))) + lval = 0; + } + View v; +}; + +template +struct CompareDescending +{ + KOKKOS_INLINE_FUNCTION bool operator()(const Scalar lhs, const Scalar rhs) const + { + return lhs > rhs; + } +}; + +template +void testBitonicSortDescending() +{ + typedef char Scalar; + typedef CompareDescending Comp; + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View ValView; + size_t n = 12521; + ValView data("Bitonic sort testing data", n); + fillRandom(data); + KokkosKernels::Impl::bitonicSort(data); + int ordered = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + CheckOrderedFunctor(data), Kokkos::Min(ordered)); + ASSERT_TRUE(ordered); +} + +struct LexCompare +{ + KOKKOS_INLINE_FUNCTION bool operator()(const Coordinates lhs, const Coordinates rhs) const + { + if(lhs.x < rhs.x) + return true; + else if(lhs.x > rhs.x) + return false; + else if(lhs.y < rhs.y) + return true; + else if(lhs.y > rhs.y) + return false; + else if(lhs.z < rhs.z) + return true; + return false; + } +}; + +template +void testBitonicSortLexicographic() +{ + typedef Coordinates Scalar; + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View ValView; + size_t n = 9521; + ValView data("Bitonic sort testing data", n); + fillRandom(data); + KokkosKernels::Impl::bitonicSort(data); + int ordered = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + CheckOrderedFunctor(data), Kokkos::Min(ordered)); + ASSERT_TRUE(ordered); +} + +TEST_F(TestCategory, common_serial_radix) { + //Test serial radix over some contiguous small arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 100; + for(size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) + { + testSerialRadixSort(numArrays, arrayMax); + testSerialRadixSort(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_serial_radix2) { + //Test serial radix over some contiguous small arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 100; + for(size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) + { + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2>(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_team_bitonic) { + //Test team-level bitonic over some contiguous medium arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 20; + for(size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) + { + testTeamBitonicSort(numArrays, arrayMax); + testTeamBitonicSort(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_team_bitonic2) { + //Test team-level bitonic over some contiguous medium 
arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 20; + for(size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) + { + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2>(numArrays, arrayMax); + } +} + +TEST_F( TestCategory, common_device_bitonic) { + //Test device-level bitonic with some larger arrays + testBitonicSort(243743); + testBitonicSort(2157); + testBitonicSort(424); + testBitonicSort(5); + testBitonicSort(92314); + testBitonicSort(123); + testBitonicSort(60234); + testBitonicSort(53); + //Test custom comparator: ">" instead of "<" to sort descending + testBitonicSortDescending(); + //Test custom comparator: lexicographic comparison of 3-element struct + testBitonicSortLexicographic(); +} + +#endif + diff --git a/packages/kokkos-kernels/unit_test/CMakeLists.txt b/packages/kokkos-kernels/unit_test/CMakeLists.txt index 96b24035f658..55e006b67756 100644 --- a/packages/kokkos-kernels/unit_test/CMakeLists.txt +++ b/packages/kokkos-kernels/unit_test/CMakeLists.txt @@ -72,6 +72,7 @@ IF (Kokkos_ENABLE_Cuda) #currently float 128 test is not working. So common tests are explicitly added. APPEND_GLOB(CUDA_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Common_ArithTraits.cpp) + APPEND_GLOB(CUDA_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Common_Sorting.cpp) TRIBITS_ADD_EXECUTABLE_AND_TEST( @@ -129,6 +130,7 @@ IF (Kokkos_ENABLE_OpenMP) ) APPEND_GLOB(OPENMP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Common_ArithTraits.cpp) + APPEND_GLOB(OPENMP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Common_Sorting.cpp) TRIBITS_ADD_EXECUTABLE_AND_TEST( common_openmp @@ -184,6 +186,7 @@ IF (Kokkos_ENABLE_Serial) ) APPEND_GLOB(SERIAL_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Common_ArithTraits.cpp) + APPEND_GLOB(SERIAL_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Common_Sorting.cpp) TRIBITS_ADD_EXECUTABLE_AND_TEST( common_serial @@ -235,6 +238,7 @@ IF (Kokkos_ENABLE_Pthread) APPEND_GLOB(THREADS_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Common_ArithTraits.cpp) + APPEND_GLOB(THREADS_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Common_Sorting.cpp) TRIBITS_ADD_EXECUTABLE_AND_TEST( common_threads diff --git a/packages/kokkos-kernels/unit_test/cuda/Test_Cuda_Common_Sorting.cpp b/packages/kokkos-kernels/unit_test/cuda/Test_Cuda_Common_Sorting.cpp new file mode 100644 index 000000000000..8c145f63eb9a --- /dev/null +++ b/packages/kokkos-kernels/unit_test/cuda/Test_Cuda_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/packages/kokkos-kernels/unit_test/openmp/Test_OpenMP_Common_Sorting.cpp b/packages/kokkos-kernels/unit_test/openmp/Test_OpenMP_Common_Sorting.cpp new file mode 100644 index 000000000000..314da395c697 --- /dev/null +++ b/packages/kokkos-kernels/unit_test/openmp/Test_OpenMP_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/packages/kokkos-kernels/unit_test/serial/Test_Serial_Common_Sorting.cpp b/packages/kokkos-kernels/unit_test/serial/Test_Serial_Common_Sorting.cpp new file mode 100644 index 000000000000..984919928651 --- /dev/null +++ b/packages/kokkos-kernels/unit_test/serial/Test_Serial_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/packages/kokkos-kernels/unit_test/threads/Test_Threads_Common_Sorting.cpp b/packages/kokkos-kernels/unit_test/threads/Test_Threads_Common_Sorting.cpp new file mode 100644 
index 000000000000..58a12a1d12c7 --- /dev/null +++ b/packages/kokkos-kernels/unit_test/threads/Test_Threads_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include <Test_Threads.hpp> +#include <Test_Common_Sorting.hpp>
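Finally, a hedged end-to-end sketch of the device-level entry point that the common_device_bitonic tests above exercise: sorting an entire View with a user-supplied comparator (descending here) and then verifying the order with a Min reduction, in the style of CheckOrderedFunctor. The include path, the <View, ExecSpace, Ordinal, Comparator> template-argument order, and the data are assumptions made for illustration; consult KokkosKernels_Sorting.hpp for the authoritative signature.

    #include <Kokkos_Core.hpp>
    #include "KokkosKernels_Sorting.hpp"   // include path is an assumption; adjust to your build
    #include <cstdio>

    // User-supplied comparator: sort descending instead of ascending.
    struct Descending
    {
      KOKKOS_INLINE_FUNCTION bool operator()(double lhs, double rhs) const
      {
        return lhs > rhs;
      }
    };

    int main(int argc, char** argv)
    {
      Kokkos::initialize(argc, argv);
      {
        using ExecSpace = Kokkos::DefaultExecutionSpace;
        using ValView   = Kokkos::View<double*, ExecSpace::memory_space>;
        const int n = 100000;
        ValView data("data", n);
        Kokkos::parallel_for("Fill", Kokkos::RangePolicy<ExecSpace>(0, n),
          KOKKOS_LAMBDA(const int i) { data(i) = (i * 7919) % 104729; });   // scrambled values
        // Template-argument order assumed to be <View, ExecSpace, Ordinal, Comparator>.
        KokkosKernels::Impl::bitonicSort<ValView, ExecSpace, int, Descending>(data);
        // Verify weak descending order, mirroring CheckOrderedFunctor in the tests above.
        int ordered = 1;
        Kokkos::parallel_reduce("Check", Kokkos::RangePolicy<ExecSpace>(0, n - 1),
          KOKKOS_LAMBDA(const int i, int& lmin) { if(data(i) < data(i + 1)) lmin = 0; },
          Kokkos::Min<int>(ordered));
        std::printf("sorted descending: %s\n", ordered ? "yes" : "no");
      }
      Kokkos::finalize();
    }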