diff --git a/packages/kokkos-kernels/src/common/KokkosKernels_Sorting.hpp b/packages/kokkos-kernels/src/common/KokkosKernels_Sorting.hpp new file mode 100644 index 000000000000..a43d5573dd86 --- /dev/null +++ b/packages/kokkos-kernels/src/common/KokkosKernels_Sorting.hpp @@ -0,0 +1,572 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSKERNELS_SORTING_HPP +#define _KOKKOSKERNELS_SORTING_HPP +#include "Kokkos_Core.hpp" +#include "Kokkos_Atomic.hpp" +#include "Kokkos_ArithTraits.hpp" +#include "impl/Kokkos_Timer.hpp" +#include + +namespace KokkosKernels { +namespace Impl { + +//Radix sort for integers, on a single thread within a team. +//Pros: few diverging branches, so OK for sorting on a single GPU vector lane. Better on CPU cores. +//Con: requires auxiliary storage, and this version only works for integers +template +KOKKOS_INLINE_FUNCTION void +SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) +{ + static_assert(std::is_integral::value && std::is_unsigned::value, + "radixSort can only be run on unsigned integers."); + if(n <= 1) + return; + ValueType maxVal = 0; + for(Ordinal i = 0; i < n; i++) + { + if(maxVal < values[i]) + maxVal = values[i]; + } + //determine how many significant bits the data has + int passes = 0; + while(maxVal) + { + maxVal >>= 4; + passes++; + } + //Is the data currently held in values (false) or valuesAux (true)? + bool inAux = false; + //sort 4 bits at a time, into 16 buckets + ValueType mask = 0xF; + //maskPos counts the low bit index of mask (0, 4, 8, ...) 
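The pass structure above (histogram into 16 buckets, exclusive prefix sum, stable scatter, ping-pong between the two buffers) is easier to follow in a host-only form. The sketch below is illustrative only: it uses std::vector, shifts by 4*p instead of carrying mask/maskPos, and scatters with an incrementing offset rather than the decrementing count used above. The function body continues below with the maskPos declaration.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Host-only sketch of the same 4-bit LSD radix pass structure (illustrative).
    void radixSortSketch(std::vector<uint32_t>& v)
    {
      if(v.size() <= 1)
        return;
      std::vector<uint32_t> aux(v.size());
      uint32_t maxVal = 0;
      for(uint32_t x : v)
        if(x > maxVal) maxVal = x;
      int passes = 0;
      while(maxVal) { maxVal >>= 4; passes++; }
      for(int p = 0; p < passes; p++)
      {
        size_t count[16] = {0};
        size_t offset[17] = {0};
        for(uint32_t x : v)
          count[(x >> (4 * p)) & 0xF]++;                 // histogram of the p-th hex digit
        for(int b = 0; b < 16; b++)
          offset[b + 1] = offset[b] + count[b];          // exclusive prefix sum
        for(uint32_t x : v)
          aux[offset[(x >> (4 * p)) & 0xF]++] = x;       // stable scatter into the other buffer
        v.swap(aux);                                     // ping-pong, like the inAux flag above
      }
    }

    int main()
    {
      std::vector<uint32_t> v{42, 7, 100000, 7, 3};
      radixSortSketch(v);
      for(uint32_t x : v)
        std::printf("%u ", x);                           // 3 7 7 42 100000
      std::printf("\n");
    }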
+ Ordinal maskPos = 0; + for(int p = 0; p < passes; p++) + { + //Count the number of elements in each bucket + Ordinal count[16] = {0}; + Ordinal offset[17]; + if(!inAux) + { + for(Ordinal i = 0; i < n; i++) + { + count[(values[i] & mask) >> maskPos]++; + } + } + else + { + for(Ordinal i = 0; i < n; i++) + { + count[(valuesAux[i] & mask) >> maskPos]++; + } + } + offset[0] = 0; + //get offset as the prefix sum for count + for(Ordinal i = 0; i < 16; i++) + { + offset[i + 1] = offset[i] + count[i]; + } + //now for each element in [lo, hi), move it to its offset in the other buffer + //this branch should be ok because whichBuf is the same on all threads + if(!inAux) + { + for(Ordinal i = 0; i < n; i++) + { + Ordinal bucket = (values[i] & mask) >> maskPos; + valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; + count[bucket]--; + } + } + else + { + for(Ordinal i = 0; i < n; i++) + { + Ordinal bucket = (valuesAux[i] & mask) >> maskPos; + values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; + count[bucket]--; + } + } + inAux = !inAux; + mask = mask << 4; + maskPos += 4; + } + //Move values back into main array if they are currently in aux. + //This is the case if an odd number of rounds were done. + if(inAux) + { + for(Ordinal i = 0; i < n; i++) + { + values[i] = valuesAux[i]; + } + } +} + +//Radix sort for integers (no internal parallelism). +//While sorting, also permute "perm" array along with the values. +//Pros: few diverging branches, so good for sorting on a single GPU vector lane. +//Con: requires auxiliary storage, this version only works for integers (although float/double is possible) +template +KOKKOS_INLINE_FUNCTION void +SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n) +{ + static_assert(std::is_integral::value && std::is_unsigned::value, + "radixSort can only be run on unsigned integers."); + if(n <= 1) + return; + ValueType maxVal = 0; + for(Ordinal i = 0; i < n; i++) + { + if(maxVal < values[i]) + maxVal = values[i]; + } + int passes = 0; + while(maxVal) + { + maxVal >>= 4; + passes++; + } + //Is the data currently held in values (false) or valuesAux (true)? + bool inAux = false; + //sort 4 bits at a time, into 16 buckets + ValueType mask = 0xF; + //maskPos counts the low bit index of mask (0, 4, 8, ...) 
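SerialRadixSort2 sorts the keys and applies the identical permutation to a companion array (for example, CRS column indices alongside their values). Because it is a KOKKOS_INLINE_FUNCTION with no internal parallelism, it can also be exercised serially on the host, which makes for a compact usage sketch. The sketch below is hedged: the include path is an assumption about the build, the data is arbitrary, and all template arguments are left to deduction so the exact parameter order does not matter. The pass loop of SerialRadixSort2 continues below.

    #include <Kokkos_Core.hpp>
    #include "KokkosKernels_Sorting.hpp"   // include path is an assumption; adjust to your build
    #include <cstdio>

    int main(int argc, char** argv)
    {
      Kokkos::initialize(argc, argv);
      {
        unsigned keys[5]    = {40, 10, 30, 50, 20};
        unsigned keysAux[5] = {0};
        int      vals[5]    = {4, 1, 3, 5, 2};
        int      valsAux[5] = {0};
        // Sort keys ascending and drag vals along with the same permutation.
        KokkosKernels::Impl::SerialRadixSort2(keys, keysAux, vals, valsAux, 5);
        for(int i = 0; i < 5; i++)
          std::printf("%u:%d ", keys[i], vals[i]);       // 10:1 20:2 30:3 40:4 50:5
        std::printf("\n");
      }
      Kokkos::finalize();
    }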
+ Ordinal maskPos = 0; + for(int p = 0; p < passes; p++) + { + //Count the number of elements in each bucket + Ordinal count[16] = {0}; + Ordinal offset[17]; + if(!inAux) + { + for(Ordinal i = 0; i < n; i++) + { + count[(values[i] & mask) >> maskPos]++; + } + } + else + { + for(Ordinal i = 0; i < n; i++) + { + count[(valuesAux[i] & mask) >> maskPos]++; + } + } + offset[0] = 0; + //get offset as the prefix sum for count + for(Ordinal i = 0; i < 16; i++) + { + offset[i + 1] = offset[i] + count[i]; + } + //now for each element in [lo, hi), move it to its offset in the other buffer + //this branch should be ok because whichBuf is the same on all threads + if(!inAux) + { + for(Ordinal i = 0; i < n; i++) + { + Ordinal bucket = (values[i] & mask) >> maskPos; + valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; + permAux[offset[bucket + 1] - count[bucket]] = perm[i]; + count[bucket]--; + } + } + else + { + for(Ordinal i = 0; i < n; i++) + { + Ordinal bucket = (valuesAux[i] & mask) >> maskPos; + values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; + perm[offset[bucket + 1] - count[bucket]] = permAux[i]; + count[bucket]--; + } + } + inAux = !inAux; + mask = mask << 4; + maskPos += 4; + } + //Move values back into main array if they are currently in aux. + //This is the case if an odd number of rounds were done. + if(inAux) + { + for(Ordinal i = 0; i < n; i++) + { + values[i] = valuesAux[i]; + perm[i] = permAux[i]; + } + } +} + +template +struct DefaultComparator +{ + KOKKOS_INLINE_FUNCTION bool operator()(const Value lhs, const Value rhs) const + { + return lhs < rhs; + } +}; + +//Bitonic merge sort (requires only comparison operators and trivially-copyable) +//Pros: In-place, plenty of parallelism for GPUs, and memory references are coalesced +//Con: O(n log^2(n)) serial time is bad on CPUs +//Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter +template> +KOKKOS_INLINE_FUNCTION void +TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem) +{ + //Algorithm only works on power-of-two input size only. + //If n is not a power-of-two, will implicitly pretend + //that values[i] for i >= n is just the max for ValueType, so it never gets swapped + Ordinal npot = 1; + Ordinal levels = 0; + while(npot < n) + { + levels++; + npot <<= 1; + } + for(Ordinal i = 0; i < levels; i++) + { + for(Ordinal j = 0; j <= i; j++) + { + // n/2 pairs of items are compared in parallel + Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), + [=](const Ordinal t) + { + //How big are the brown/pink boxes? + Ordinal boxSize = Ordinal(2) << (i - j); + //Which box contains this thread? 
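The box arithmetic in this kernel (and the identical arithmetic in TeamBitonicSort2 below) is easier to check in a serial restatement. The sketch below replays the same network on the host, with the TeamVectorRange over t replaced by a plain loop; it is illustration only, and like the team version it handles a non-power-of-two n by skipping comparisons whose partner index falls past the end.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Host-only restatement of the bitonic network above (illustrative).
    void bitonicSketch(std::vector<int>& v)
    {
      const int n = (int) v.size();
      int npot = 1, levels = 0;
      while(npot < n) { levels++; npot <<= 1; }
      for(int i = 0; i < levels; i++)
        for(int j = 0; j <= i; j++)
          for(int t = 0; t < npot / 2; t++)              // the team does these in parallel
          {
            const int boxSize   = 2 << (i - j);
            const int boxID     = t >> (i - j);
            const int boxStart  = boxID << (1 + i - j);
            const int boxOffset = t - (boxStart >> 1);
            const int elem1     = boxStart + boxOffset;
            // j == 0 compares mirrored elements of the box; j > 0 compares at stride boxSize/2.
            const int elem2 = (j == 0) ? boxStart + boxSize - 1 - boxOffset
                                       : elem1 + boxSize / 2;
            if(elem2 < n && v[elem2] < v[elem1])
              std::swap(v[elem1], v[elem2]);
          }
    }

    int main()
    {
      std::vector<int> v{9, 1, 7, 3, 8, 2, 5};           // n = 7, not a power of two
      bitonicSketch(v);
      for(int x : v)
        std::printf("%d ", x);                           // 1 2 3 5 7 8 9
      std::printf("\n");
    }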
+ Ordinal boxID = t >> (i - j); //t * 2 / boxSize; + Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize + Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; + Ordinal elem1 = boxStart + boxOffset; + Comparator comp; + if(j == 0) + { + //first phase (brown box): within a block, compare with the opposite value in the box + Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; + if(elem2 < n) + { + //both elements in bounds, so compare them and swap if out of order + if(comp(values[elem2], values[elem1])) + { + ValueType temp = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp; + } + } + } + else + { + //later phases (pink box): within a block, compare with fixed distance (boxSize / 2) apart + Ordinal elem2 = elem1 + boxSize / 2; + if(elem2 < n) + { + if(comp(values[elem2], values[elem1])) + { + ValueType temp = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp; + } + } + } + }); + mem.team_barrier(); + } + } +} + +//Sort "values", while applying the same swaps to "perm" +template> +KOKKOS_INLINE_FUNCTION void +TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem) +{ + //Algorithm only works on power-of-two input size only. + //If n is not a power-of-two, will implicitly pretend + //that values[i] for i >= n is just the max for ValueType, so it never gets swapped + Ordinal npot = 1; + Ordinal levels = 0; + while(npot < n) + { + levels++; + npot <<= 1; + } + for(Ordinal i = 0; i < levels; i++) + { + for(Ordinal j = 0; j <= i; j++) + { + // n/2 pairs of items are compared in parallel + Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), + [=](const Ordinal t) + { + //How big are the brown/pink boxes? + Ordinal boxSize = Ordinal(2) << (i - j); + //Which box contains this thread? 
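TeamBitonicSort2 performs the same network but applies every swap to a companion array as well, which is the usual pattern for sorting CRS column indices together with their values. A hedged usage sketch follows: one team per fixed-length row, with the view names, row length, and include path chosen only for illustration; template arguments are left to deduction. The kernel body continues below.

    #include <Kokkos_Core.hpp>
    #include "KokkosKernels_Sorting.hpp"   // include path is an assumption; adjust to your build

    int main(int argc, char** argv)
    {
      Kokkos::initialize(argc, argv);
      {
        using ExecSpace = Kokkos::DefaultExecutionSpace;
        using TeamPol   = Kokkos::TeamPolicy<ExecSpace>;
        const int numRows = 100, rowLen = 64;
        Kokkos::View<int*>    keys("keys", numRows * rowLen);
        Kokkos::View<double*> vals("vals", numRows * rowLen);
        Kokkos::parallel_for("Fill", Kokkos::RangePolicy<ExecSpace>(0, numRows * rowLen),
          KOKKOS_LAMBDA(const int i)
          {
            keys(i) = rowLen - (i % rowLen);             // every row starts reverse-sorted
            vals(i) = 0.5 * keys(i);                     // value that must follow its key
          });
        // One team sorts one row, permuting vals exactly as keys are permuted.
        Kokkos::parallel_for("SortRows", TeamPol(numRows, Kokkos::AUTO()),
          KOKKOS_LAMBDA(const TeamPol::member_type& t)
          {
            const int row = t.league_rank();
            KokkosKernels::Impl::TeamBitonicSort2(keys.data() + row * rowLen,
                                                  vals.data() + row * rowLen, rowLen, t);
          });
      }
      Kokkos::finalize();
    }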
+ Ordinal boxID = t >> (i - j); //t * 2 / boxSize; + Ordinal boxStart = boxID << (1 + i - j); //boxID * boxSize + Ordinal boxOffset = t - (boxStart >> 1); //t - boxID * boxSize / 2; + Ordinal elem1 = boxStart + boxOffset; + Comparator comp; + if(j == 0) + { + //first phase (brown box): within a block, compare with the opposite value in the box + Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; + if(elem2 < n) + { + //both elements in bounds, so compare them and swap if out of order + if(comp(values[elem2], values[elem1])) + { + ValueType temp1 = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp1; + PermType temp2 = perm[elem1]; + perm[elem1] = perm[elem2]; + perm[elem2] = temp2; + } + } + } + else + { + //later phases (pink box): within a block, compare with fixed distance (boxSize / 2) apart + Ordinal elem2 = elem1 + boxSize / 2; + if(elem2 < n) + { + if(comp(values[elem2], values[elem1])) + { + ValueType temp1 = values[elem1]; + values[elem1] = values[elem2]; + values[elem2] = temp1; + PermType temp2 = perm[elem1]; + perm[elem1] = perm[elem2]; + perm[elem2] = temp2; + } + } + } + }); + mem.team_barrier(); + } + } +} + +//Functor that sorts a view on one team +template +struct BitonicSingleTeamFunctor +{ + BitonicSingleTeamFunctor(View& v_) : v(v_) {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + TeamBitonicSort(v.data(), v.extent(0), t); + }; + View v; +}; + +//Functor that sorts equally sized chunks on each team +template +struct BitonicChunkFunctor +{ + BitonicChunkFunctor(View& v_, Ordinal chunkSize_) : v(v_), chunkSize(chunkSize_) {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal chunk = t.league_rank(); + Ordinal chunkStart = chunk * chunkSize; + Ordinal n = chunkSize; + if(chunkStart + n > Ordinal(v.extent(0))) + n = v.extent(0) - chunkStart; + TeamBitonicSort(v.data() + chunkStart, n, t); + }; + View v; + Ordinal chunkSize; +}; + +//Functor that does just the first phase (brown) of bitonic sort on equally-sized chunks +template +struct BitonicPhase1Functor +{ + typedef typename View::value_type Value; + BitonicPhase1Functor(View& v_, Ordinal boxSize_, Ordinal teamsPerBox_) + : v(v_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal box = t.league_rank() / teamsPerBox; + Ordinal boxStart = boxSize * box; + Ordinal work = boxSize / teamsPerBox / 2; + Ordinal workStart = work * (t.league_rank() % teamsPerBox); + Ordinal workReflect = boxSize - workStart - 1; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [=](const Ordinal i) + { + Comparator comp; + Ordinal elem1 = boxStart + workStart + i; + Ordinal elem2 = boxStart + workReflect - i; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + }; + View v; + Ordinal boxSize; + Ordinal teamsPerBox; +}; + +//Functor that does the second phase (red) of bitonic sort +template +struct BitonicPhase2Functor +{ + typedef typename View::value_type Value; + BitonicPhase2Functor(View& v_, Ordinal boxSize_, Ordinal teamsPerBox_) + : v(v_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal logBoxSize = 1; + while((Ordinal(1) << logBoxSize) < boxSize) + logBoxSize++; + Ordinal box = t.league_rank() / teamsPerBox; + Ordinal boxStart = boxSize * box; + Ordinal work = boxSize / teamsPerBox / 2; + 
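For orientation, these two phase functors are driven by the bitonicSort() host routine further down in this file, which first sorts fixed-size chunks within teams and then sweeps phase 1 and phase 2 over progressively smaller boxes. The standalone snippet below only prints the launch schedule that driver would issue for a hypothetical npot = 4096 with its default chunkSize of 512; the loop bounds are copied from the driver and the concrete numbers are an illustration. The functor body resumes below.

    #include <cstdio>

    int main()
    {
      const int npot = 4096, chunkSize = 512;            // chunkSize matches the driver's default
      std::printf("sort %d chunks of %d, one chunk per team\n", npot / chunkSize, chunkSize);
      for(int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2)
      {
        const int boxSize = teamsPerBox * chunkSize;
        std::printf("phase 1: boxSize %4d, teamsPerBox %d\n", boxSize, teamsPerBox);
        for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++)
          std::printf("phase 2: boxSize %4d, teamsPerBox %d\n",
                      boxSize >> boxDiv, teamsPerBox >> boxDiv);
      }
    }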
Ordinal workStart = boxStart + work * (t.league_rank() % teamsPerBox); + Ordinal jump = boxSize / 2; + Comparator comp; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [=](const Ordinal i) + { + Ordinal elem1 = workStart + i; + Ordinal elem2 = workStart + jump + i; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + if(teamsPerBox == 1) + { + //This team can finish phase 2 for all the smaller red boxes that follow, + //since there are no longer cross-team data dependencies + for(Ordinal subLevel = 1; subLevel < logBoxSize; subLevel++) + { + t.team_barrier(); + Ordinal logSubBoxSize = logBoxSize - subLevel; + Ordinal subBoxSize = Ordinal(1) << logSubBoxSize; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [=](const Ordinal i) + { + Ordinal globalThread = i + t.league_rank() * work; + Ordinal subBox = globalThread >> (logSubBoxSize - 1); + Ordinal subBoxStart = subBox << logSubBoxSize; + Ordinal subBoxOffset = globalThread & ((Ordinal(1) << (logSubBoxSize - 1)) - 1); //i % (subBoxSize / 2) + Ordinal elem1 = subBoxStart + subBoxOffset; + //later phases (pink box): within a block, compare with fixed distance (boxSize / 2) apart + Ordinal elem2 = elem1 + subBoxSize / 2; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + } + } + }; + View v; + Ordinal boxSize; + Ordinal teamsPerBox; +}; + +//Version to be called from host on a single array +//Generally ~2x slower than Kokkos::sort() for large arrays (> 50 M elements), +//but faster for smaller arrays. +// +//This is more general than BinSort: bitonic supports any trivially copyable type +//and an arbitrary device-compatible comparison operator (provided through operator() of Comparator) +//If comparator is void, use operator< (which should only be used for primitives) +template> +void bitonicSort(View v) +{ + typedef Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + Ordinal n = v.extent(0); + //If n is small, just sort on a single team + if(n <= Ordinal(1) << 16) + { + Kokkos::parallel_for(team_policy(1, Kokkos::AUTO()), + BitonicSingleTeamFunctor(v)); + } + else + { + Ordinal npot = 1; + while(npot < n) + npot <<= 1; + //Partition the data equally among fixed number of teams + Ordinal chunkSize = 512; + Ordinal numTeams = npot / chunkSize; + //First, sort within teams + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + BitonicChunkFunctor(v, chunkSize)); + for(int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) + { + Ordinal boxSize = teamsPerBox * chunkSize; + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + BitonicPhase1Functor(v, boxSize, teamsPerBox)); + for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) + { + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + BitonicPhase2Functor(v, boxSize >> boxDiv, teamsPerBox >> boxDiv)); + } + } + } +} + +}} + +#endif + diff --git a/packages/kokkos-kernels/test_common/Test_Common_Sorting.hpp b/packages/kokkos-kernels/test_common/Test_Common_Sorting.hpp new file mode 100644 index 000000000000..a0b1f78225e9 --- /dev/null +++ b/packages/kokkos-kernels/test_common/Test_Common_Sorting.hpp @@ -0,0 +1,603 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia 
Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Test_Common_Sorting.hpp +/// \brief Tests for radixSort and bitonicSort in KokkoKernels_Sorting.hpp + +#ifndef KOKKOSKERNELS_SORTINGTEST_HPP +#define KOKKOSKERNELS_SORTINGTEST_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +//Generate n randomized counts with mean . +//Then prefix-sum into randomOffsets. +//This simulates a CRS rowmap or other batched sorting scenario +template +size_t generateRandomOffsets(OrdView randomCounts, OrdView randomOffsets, size_t n, size_t avg) +{ + srand(54321); + auto countsHost = Kokkos::create_mirror_view(randomCounts); + size_t total = 0; + for(size_t i = 0; i < n; i++) + { + if(avg == 0) + countsHost(i) = 0; + else + countsHost(i) = 0.5 + rand() % (avg * 2); + total += countsHost(i); + } + Kokkos::deep_copy(randomCounts, countsHost); + Kokkos::deep_copy(randomOffsets, randomCounts); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(n, randomOffsets); + return total; +} + +struct Coordinates +{ + KOKKOS_INLINE_FUNCTION Coordinates() + { + x = 0; + y = 0; + z = 0; + } + KOKKOS_INLINE_FUNCTION Coordinates(double x_, double y_, double z_) + { + x = x_; + y = y_; + z = z_; + } + double x; + double y; + double z; +}; + +/* +getRandom generates a random object for sorting. 
+the general version just produces integers - below +are some specializations +*/ + +template +T getRandom() +{ + return rand() % Kokkos::ArithTraits::max(); +} + +//Generate a uniform double between (-5, 5) +template<> +double getRandom() +{ + return -5 + (10.0 * rand()) / RAND_MAX; +} + +template<> +Coordinates getRandom() +{ + return Coordinates(getRandom(), getRandom(), getRandom()); +} + +//Specialize for Kokkos::complex, with the real and imaginary parts different +template +struct kvHash +{ + Value operator()(const Key& k) + { + return (Value) (3 * k + 4); + } +}; + +template +struct kvHash> +{ + Kokkos::complex operator()(const Key& k) + { + return Kokkos::complex(3 * k + 4, k - 10.4); + } +}; + +template +void fillRandom(View v) +{ + srand(12345); + typedef typename View::value_type Value; + auto vhost = Kokkos::create_mirror_view(v); + for(size_t i = 0; i < v.extent(0); i++) + vhost(i) = getRandom(); + Kokkos::deep_copy(v, vhost); +} + +template +void fillRandom(KeyView k, ValView v) +{ + srand(23456); + typedef typename KeyView::value_type Key; + typedef typename ValView::value_type Value; + auto khost = Kokkos::create_mirror_view(k); + auto vhost = Kokkos::create_mirror_view(v); + for(size_t i = 0; i < v.extent(0); i++) + { + khost(i) = getRandom(); + vhost(i) = kvHash()(khost(i)); + } + Kokkos::deep_copy(k, khost); + Kokkos::deep_copy(v, vhost); +} + +template +struct TestSerialRadixFunctor +{ + using Key = typename KeyView::value_type; + using UnsignedKey = typename std::make_unsigned::type; + + TestSerialRadixFunctor(KeyView& keys_, KeyView& keysAux_, OrdView& counts_, OrdView& offsets_) + : keys(keys_), keysAux(keysAux_), counts(counts_), offsets(offsets_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const int i) const + { + int off = offsets(i); + KokkosKernels::Impl::SerialRadixSort( + (UnsignedKey*) keys.data() + off, (UnsignedKey*) keysAux.data() + off, counts(i)); + } + KeyView keys; + KeyView keysAux; + OrdView counts; + OrdView offsets; +}; + +template +struct TestSerialRadix2Functor +{ + //Sort by keys, while permuting values + using Key = typename KeyView::value_type; + using UnsignedKey = typename std::make_unsigned::type; + using Value = typename ValView::value_type; + + TestSerialRadix2Functor(KeyView& keys_, KeyView& keysAux_, ValView& values_, ValView& valuesAux_, OrdView& counts_, OrdView& offsets_) + : keys(keys_), keysAux(keysAux_), values(values_), valuesAux(valuesAux_), counts(counts_), offsets(offsets_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const int i) const + { + int off = offsets(i); + KokkosKernels::Impl::SerialRadixSort2( + (UnsignedKey*) keys.data() + off, (UnsignedKey*) keysAux.data() + off, + values.data() + off, valuesAux.data() + off, counts(i)); + } + KeyView keys; + KeyView keysAux; + ValView values; + ValView valuesAux; + OrdView counts; + OrdView offsets; +}; + +template +void testSerialRadixSort(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View KeyView; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + KeyView keys("Radix sort testing data", n); + fillRandom(keys); + //Sort 
using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); + for(size_t i = 0; i < k; i++) + { + Key* begin = gold.data() + offsetsHost(i); + Key* end = begin + countsHost(i); + std::sort(begin, end); + } + KeyView keysAux("Radix sort aux data", n); + //Run the sorting on device in all sub-arrays in parallel + typedef Kokkos::RangePolicy range_policy; + Kokkos::parallel_for(range_policy(0, k), + TestSerialRadixFunctor(keys, keysAux, counts, offsets)); + //Copy result to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(keysHost(i), gold(i)); + } +} + +template +void testSerialRadixSort2(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View KeyView; + typedef Kokkos::View ValView; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + KeyView keys("Radix test keys", n); + ValView data("Radix test data", n); + //The keys are randomized + fillRandom(keys, data); + KeyView keysAux("Radix sort aux keys", n); + ValView dataAux("Radix sort aux data", n); + //Run the sorting on device in all sub-arrays in parallel + typedef Kokkos::RangePolicy range_policy; + //Deliberately using a weird number for vector length + Kokkos::parallel_for(range_policy(0, k), + TestSerialRadix2Functor(keys, keysAux, data, dataAux, counts, offsets)); + //Sort using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); + for(size_t i = 0; i < k; i++) + { + Key* begin = gold.data() + offsetsHost(i); + Key* end = begin + countsHost(i); + std::sort(begin, end); + } + //Copy results to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + //Make sure keys are sorted exactly (stability of sort doesn't matter) + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(keysHost(i), gold(i)); + } + //Make sure the hashes of each key still matches the corresponding value + for(size_t i = 0; i < n; i++) + { + auto correctHash = kvHash()(keysHost(i)); + ASSERT_EQ(dataHost(i), correctHash); + } +} + +template +struct TestTeamBitonicFunctor +{ + typedef typename ValView::value_type Value; + + TestTeamBitonicFunctor(ValView& values_, OrdView& counts_, OrdView& offsets_) + : values(values_), counts(counts_), offsets(offsets_) + {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + { + int i = t.league_rank(); + KokkosKernels::Impl::TeamBitonicSort(values.data() + offsets(i), counts(i), t); + } + + ValView values; + OrdView counts; + OrdView offsets; +}; + +template +struct TestTeamBitonic2Functor +{ + typedef typename KeyView::value_type Key; + typedef typename ValView::value_type Value; + + TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_) + : keys(keys_), values(values_), counts(counts_), offsets(offsets_) + {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + { + int i = 
t.league_rank(); + KokkosKernels::Impl::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), counts(i), t); + } + + KeyView keys; + ValView values; + OrdView counts; + OrdView offsets; +}; + +template +void testTeamBitonicSort(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View ValView; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + ValView data("Bitonic sort testing data", n); + fillRandom(data); + //Run the sorting on device in all sub-arrays in parallel + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + TestTeamBitonicFunctor(data, counts, offsets)); + //Copy result to host + auto dataHost = Kokkos::create_mirror_view(data); + Kokkos::deep_copy(dataHost, data); + //Sort using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, data); + for(size_t i = 0; i < k; i++) + { + Scalar* begin = gold.data() + offsetsHost(i); + Scalar* end = begin + countsHost(i); + std::sort(begin, end); + } + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(dataHost(i), gold(i)); + } +} + +template +void testTeamBitonicSort2(size_t k, size_t subArraySize) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View OrdView; + typedef Kokkos::View KeyView; + typedef Kokkos::View ValView; + OrdView counts("Subarray Sizes", k); + OrdView offsets("Subarray Offsets", k); + //Generate k sub-array sizes, each with size about 20 + size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); + auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); + auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); + KeyView keys("Bitonic test keys", n); + ValView data("Bitonic test data", n); + //The keys are randomized + fillRandom(keys, data); + //Run the sorting on device in all sub-arrays in parallel, just using vector loops + //Deliberately using a weird number for vector length + Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), + TestTeamBitonic2Functor(keys, data, counts, offsets)); + //Sort using std::sort on host to do correctness test + Kokkos::View gold("Host sorted", n); + Kokkos::deep_copy(gold, keys); + for(size_t i = 0; i < k; i++) + { + Key* begin = gold.data() + offsetsHost(i); + Key* end = begin + countsHost(i); + std::sort(begin, end); + } + //Copy results to host + auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); + auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); + //Make sure keys are sorted exactly (stability of sort doesn't matter) + for(size_t i = 0; i < n; i++) + { + ASSERT_EQ(keysHost(i), gold(i)); + } + //Make sure the hashes of each key still matches the corresponding value + for(size_t i = 0; i < n; i++) + { + auto correctHash = kvHash()(keysHost(i)); + ASSERT_EQ(dataHost(i), correctHash); + } +} + +template +struct CheckSortedFunctor +{ + CheckSortedFunctor(View& v_) + : v(v_) {} + KOKKOS_INLINE_FUNCTION void operator()(int i, int& lval) const + { + if(v(i) > v(i + 1)) + 
lval = 0; + } + View v; +}; + +template +void testBitonicSort(size_t n) +{ + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View ValView; + ValView data("Bitonic sort testing data", n); + fillRandom(data); + KokkosKernels::Impl::bitonicSort(data); + int ordered = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + CheckSortedFunctor(data), Kokkos::Min(ordered)); + ASSERT_TRUE(ordered); +} + +//Check that the view is weakly ordered according to Comparator: +//Comparator never says that element i+1 belongs before element i. +template +struct CheckOrderedFunctor +{ + CheckOrderedFunctor(View& v_) + : v(v_) {} + KOKKOS_INLINE_FUNCTION void operator()(int i, int& lval) const + { + Comparator comp; + if(comp(v(i + 1), v(i))) + lval = 0; + } + View v; +}; + +template +struct CompareDescending +{ + KOKKOS_INLINE_FUNCTION bool operator()(const Scalar lhs, const Scalar rhs) const + { + return lhs > rhs; + } +}; + +template +void testBitonicSortDescending() +{ + typedef char Scalar; + typedef CompareDescending Comp; + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View ValView; + size_t n = 12521; + ValView data("Bitonic sort testing data", n); + fillRandom(data); + KokkosKernels::Impl::bitonicSort(data); + int ordered = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + CheckOrderedFunctor(data), Kokkos::Min(ordered)); + ASSERT_TRUE(ordered); +} + +struct LexCompare +{ + KOKKOS_INLINE_FUNCTION bool operator()(const Coordinates lhs, const Coordinates rhs) const + { + if(lhs.x < rhs.x) + return true; + else if(lhs.x > rhs.x) + return false; + else if(lhs.y < rhs.y) + return true; + else if(lhs.y > rhs.y) + return false; + else if(lhs.z < rhs.z) + return true; + return false; + } +}; + +template +void testBitonicSortLexicographic() +{ + typedef Coordinates Scalar; + //Create a view of randomized data + typedef typename ExecSpace::memory_space mem_space; + typedef Kokkos::View ValView; + size_t n = 9521; + ValView data("Bitonic sort testing data", n); + fillRandom(data); + KokkosKernels::Impl::bitonicSort(data); + int ordered = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), + CheckOrderedFunctor(data), Kokkos::Min(ordered)); + ASSERT_TRUE(ordered); +} + +TEST_F(TestCategory, common_serial_radix) { + //Test serial radix over some contiguous small arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 100; + for(size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) + { + testSerialRadixSort(numArrays, arrayMax); + testSerialRadixSort(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_serial_radix2) { + //Test serial radix over some contiguous small arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 100; + for(size_t arrayMax = 0; arrayMax < 1000; arrayMax = 1 + 4 * arrayMax) + { + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2(numArrays, arrayMax); + testSerialRadixSort2>(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_team_bitonic) { + //Test team-level bitonic over some contiguous medium arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 20; + for(size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) + { + testTeamBitonicSort(numArrays, arrayMax); + testTeamBitonicSort(numArrays, arrayMax); + } +} + +TEST_F(TestCategory, common_team_bitonic2) { + //Test team-level bitonic over some contiguous medium 
arrays + //1st arg is #arrays, 2nd arg is max subarray size + size_t numArrays = 20; + for(size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) + { + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2(numArrays, arrayMax); + testTeamBitonicSort2>(numArrays, arrayMax); + } +} + +TEST_F( TestCategory, common_device_bitonic) { + //Test device-level bitonic with some larger arrays + testBitonicSort(243743); + testBitonicSort(2157); + testBitonicSort(424); + testBitonicSort(5); + testBitonicSort(92314); + testBitonicSort(123); + testBitonicSort(60234); + testBitonicSort(53); + //Test custom comparator: ">" instead of "<" to sort descending + testBitonicSortDescending(); + //Test custom comparator: lexicographic comparison of 3-element struct + testBitonicSortLexicographic(); +} + +#endif + diff --git a/packages/kokkos-kernels/unit_test/CMakeLists.txt b/packages/kokkos-kernels/unit_test/CMakeLists.txt index 96b24035f658..55e006b67756 100644 --- a/packages/kokkos-kernels/unit_test/CMakeLists.txt +++ b/packages/kokkos-kernels/unit_test/CMakeLists.txt @@ -72,6 +72,7 @@ IF (Kokkos_ENABLE_Cuda) #currently float 128 test is not working. So common tests are explicitly added. APPEND_GLOB(CUDA_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Common_ArithTraits.cpp) + APPEND_GLOB(CUDA_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/cuda/Test_Cuda_Common_Sorting.cpp) TRIBITS_ADD_EXECUTABLE_AND_TEST( @@ -129,6 +130,7 @@ IF (Kokkos_ENABLE_OpenMP) ) APPEND_GLOB(OPENMP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Common_ArithTraits.cpp) + APPEND_GLOB(OPENMP_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/openmp/Test_OpenMP_Common_Sorting.cpp) TRIBITS_ADD_EXECUTABLE_AND_TEST( common_openmp @@ -184,6 +186,7 @@ IF (Kokkos_ENABLE_Serial) ) APPEND_GLOB(SERIAL_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Common_ArithTraits.cpp) + APPEND_GLOB(SERIAL_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/serial/Test_Serial_Common_Sorting.cpp) TRIBITS_ADD_EXECUTABLE_AND_TEST( common_serial @@ -235,6 +238,7 @@ IF (Kokkos_ENABLE_Pthread) APPEND_GLOB(THREADS_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Common_ArithTraits.cpp) + APPEND_GLOB(THREADS_COMMON_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/threads/Test_Threads_Common_Sorting.cpp) TRIBITS_ADD_EXECUTABLE_AND_TEST( common_threads diff --git a/packages/kokkos-kernels/unit_test/cuda/Test_Cuda_Common_Sorting.cpp b/packages/kokkos-kernels/unit_test/cuda/Test_Cuda_Common_Sorting.cpp new file mode 100644 index 000000000000..8c145f63eb9a --- /dev/null +++ b/packages/kokkos-kernels/unit_test/cuda/Test_Cuda_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/packages/kokkos-kernels/unit_test/openmp/Test_OpenMP_Common_Sorting.cpp b/packages/kokkos-kernels/unit_test/openmp/Test_OpenMP_Common_Sorting.cpp new file mode 100644 index 000000000000..314da395c697 --- /dev/null +++ b/packages/kokkos-kernels/unit_test/openmp/Test_OpenMP_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/packages/kokkos-kernels/unit_test/serial/Test_Serial_Common_Sorting.cpp b/packages/kokkos-kernels/unit_test/serial/Test_Serial_Common_Sorting.cpp new file mode 100644 index 000000000000..984919928651 --- /dev/null +++ b/packages/kokkos-kernels/unit_test/serial/Test_Serial_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include +#include diff --git a/packages/kokkos-kernels/unit_test/threads/Test_Threads_Common_Sorting.cpp b/packages/kokkos-kernels/unit_test/threads/Test_Threads_Common_Sorting.cpp new file mode 100644 
index 000000000000..58a12a1d12c7 --- /dev/null +++ b/packages/kokkos-kernels/unit_test/threads/Test_Threads_Common_Sorting.cpp @@ -0,0 +1,2 @@ +#include <Test_Threads.hpp> +#include <Test_Common_Sorting.hpp>
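Finally, a hedged end-to-end sketch of the device-level entry point that the common_device_bitonic tests above exercise: sorting an entire View with a user-supplied comparator (descending here) and then verifying the order with a Min reduction, in the style of CheckOrderedFunctor. The include path, the <View, ExecSpace, Ordinal, Comparator> template-argument order, and the data are assumptions made for illustration; consult KokkosKernels_Sorting.hpp for the authoritative signature.

    #include <Kokkos_Core.hpp>
    #include "KokkosKernels_Sorting.hpp"   // include path is an assumption; adjust to your build
    #include <cstdio>

    // User-supplied comparator: sort descending instead of ascending.
    struct Descending
    {
      KOKKOS_INLINE_FUNCTION bool operator()(double lhs, double rhs) const
      {
        return lhs > rhs;
      }
    };

    int main(int argc, char** argv)
    {
      Kokkos::initialize(argc, argv);
      {
        using ExecSpace = Kokkos::DefaultExecutionSpace;
        using ValView   = Kokkos::View<double*, ExecSpace::memory_space>;
        const int n = 100000;
        ValView data("data", n);
        Kokkos::parallel_for("Fill", Kokkos::RangePolicy<ExecSpace>(0, n),
          KOKKOS_LAMBDA(const int i) { data(i) = (i * 7919) % 104729; });   // scrambled values
        // Template-argument order assumed to be <View, ExecSpace, Ordinal, Comparator>.
        KokkosKernels::Impl::bitonicSort<ValView, ExecSpace, int, Descending>(data);
        // Verify weak descending order, mirroring CheckOrderedFunctor in the tests above.
        int ordered = 1;
        Kokkos::parallel_reduce("Check", Kokkos::RangePolicy<ExecSpace>(0, n - 1),
          KOKKOS_LAMBDA(const int i, int& lmin) { if(data(i) < data(i + 1)) lmin = 0; },
          Kokkos::Min<int>(ordered));
        std::printf("sorted descending: %s\n", ordered ? "yes" : "no");
      }
      Kokkos::finalize();
    }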