From e8112f7d14fe547095bad2b28184fc130345fa55 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 6 Oct 2020 11:49:39 -0600 Subject: [PATCH 01/18] WIP: adding HIP codepaths in preparation for tests/ETI --- perf_test/graph/KokkosGraph_color.cpp | 9 + perf_test/graph/KokkosGraph_color_d2.cpp | 23 +- perf_test/graph/KokkosGraph_mis_d2.cpp | 18 +- perf_test/graph/KokkosGraph_triangle.cpp | 19 +- perf_test/sparse/KokkosSparse_pcg.cpp | 291 ++--- perf_test/sparse/KokkosSparse_spadd.cpp | 2 +- perf_test/sparse/KokkosSparse_spgemm.cpp | 17 +- src/Kokkos_ArithTraits.hpp | 11 +- .../KokkosBatched_Gemm_Team_Internal.hpp | 20 +- .../KokkosBatched_Trsm_Team_Internal.hpp | 20 +- .../KokkosBatched_Trsv_Serial_Internal.hpp | 2 +- .../KokkosBatched_Trsv_Team_Internal.hpp | 2 +- src/batched/KokkosBatched_Util.hpp | 2 +- src/batched/KokkosBatched_Vector.hpp | 38 + src/batched/KokkosBatched_Vector_SIMD.hpp | 10 +- .../KokkosBatched_Vector_SIMD_Arith.hpp | 8 +- src/blas/impl/KokkosBlas2_gemv_impl.hpp | 4 +- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 7 + src/blas/impl/KokkosBlas3_gemm_spec.hpp | 4 + src/common/KokkosKernels_BitUtils.hpp | 1 + src/common/KokkosKernels_ExecSpaceUtils.hpp | 86 +- src/common/KokkosKernels_Handle.hpp | 2 +- src/common/KokkosKernels_Macros.hpp | 4 +- src/common/KokkosKernels_SparseUtils.hpp | 168 +-- ...Kernels_Uniform_Initialized_MemoryPool.hpp | 3 +- src/common/KokkosKernels_Utils.hpp | 67 +- src/common/KokkosKernels_default_types.hpp | 2 + .../KokkosGraph_Distance1ColorHandle.hpp | 70 +- .../KokkosGraph_Distance2ColorHandle.hpp | 62 +- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 2 +- src/sparse/KokkosSparse_CrsMatrix.hpp | 6 + .../KokkosSparse_gauss_seidel_handle.hpp | 74 +- src/sparse/KokkosSparse_spadd.hpp | 61 - src/sparse/KokkosSparse_spgemm_handle.hpp | 76 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 47 +- .../impl/KokkosSparse_partitioning_impl.hpp | 529 --------- .../KokkosSparse_spgemm_impl_compression.hpp | 53 +- .../impl/KokkosSparse_spgemm_impl_def.hpp | 5 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 26 +- .../impl/KokkosSparse_spgemm_impl_speed.hpp | 8 +- .../KokkosSparse_spgemm_impl_symbolic.hpp | 86 +- .../KokkosSparse_spgemm_impl_triangle.hpp | 48 +- ...se_spgemm_impl_triangle_no_compression.hpp | 46 +- ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 26 +- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 451 +++----- .../impl/KokkosSparse_spmv_struct_impl.hpp | 1005 ++++++++--------- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 17 + test_common/KokkosKernels_TestParameters.hpp | 2 + 48 files changed, 1312 insertions(+), 2228 deletions(-) diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cbc3697517..f7d8a93e80 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -579,6 +579,15 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_experiment + ( + params + ); + } +#endif + #if defined( KOKKOS_ENABLE_SERIAL ) if (params.use_serial) { #ifdef KOKKOSKERNELS_MULTI_MEM diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index 970bafa380..04d977527d 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -81,6 +81,7 @@ struct D2Parameters int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; const char* mtx_file; ColoringMode d2_color_type; @@ -93,6 +94,7 @@ struct 
D2Parameters use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; mtx_file = NULL; d2_color_type = MODE_D2_SYMMETRIC; @@ -147,6 +149,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #ifdef KOKKOS_ENABLE_CUDA << spaces << " --cuda Use given CUDA device" << std::endl +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use given HIP device" << std::endl #endif << std::endl << spaces << " Coloring modes:" << std::endl @@ -199,6 +204,10 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) { params.use_cuda = 1 + atoi(getNextArg(i, argc, argv)); } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1 + atoi(getNextArg(i, argc, argv)); + } else if(0 == strcasecmp(argv[i], "--repeat")) { params.repeat = atoi(getNextArg(i, argc, argv)); @@ -273,7 +282,7 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -603,6 +612,8 @@ int main(int argc, char *argv[]) int device_id = 0; if(params.use_cuda) device_id = params.use_cuda - 1; + else if(params.use_hip) + device_id = params.use_hip - 1; Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); // Print out verbose information about the configuration of the run. @@ -645,6 +656,16 @@ int main(int argc, char *argv[]) } #endif + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + if(!use_multi_mem) + { + KokkosKernels::Experiment::experiment_driver(params); + } + } + #endif + #if defined(KOKKOS_ENABLE_SERIAL) if(params.use_serial) { diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index da9fb549d6..32ff5f5fbd 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -75,6 +75,7 @@ struct MIS2Parameters int use_threads = 0; int use_openmp = 0; int use_cuda = 0; + int use_hip = 0; int use_serial = 0; const char* mtx_file = NULL; MIS2_Algorithm algo = MIS2_FAST; @@ -163,6 +164,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #ifdef KOKKOS_ENABLE_CUDA << spaces << " --cuda Use CUDA.\n" +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use HIP.\n" #endif << std::endl << spaces << " Optional Parameters:" << std::endl @@ -205,6 +209,10 @@ int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) { params.use_cuda = 1; } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1; + } else if(0 == strcasecmp(argv[i], "--repeat")) { params.repeat = atoi(getNextArg(i, argc, argv)); @@ -252,7 +260,7 @@ int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -362,6 +370,14 @@ int main(int argc, char *argv[]) } #endif + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + run_mis2(params); + run = true; + } + #endif + #if defined(KOKKOS_ENABLE_SERIAL) if(params.use_serial) { diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 6f0b6c73df..63a52dbaea 100644 --- 
a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -54,7 +54,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda" << std::endl; + std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda | --hip" << std::endl; std::cerr << "Input Matrix : --amtx [path_to_input_matrix]" << std::endl; std::cerr << "\tInput Matrix format can be multiple formats. If it ends with:" << std::endl; std::cerr << "\t\t.mtx: it will read matrix market format." << std::endl; @@ -96,6 +96,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi( argv[++i] ); } @@ -292,7 +295,6 @@ int main (int argc, char ** argv){ const int device_id = 0; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); -#if !defined (KOKKOS_ENABLE_CUDA) #if defined( KOKKOS_ENABLE_OPENMP ) if (params.use_openmp) { @@ -311,10 +313,9 @@ int main (int argc, char ** argv){ } #endif -#endif -#if defined( KOKKOS_ENABLE_CUDA1 ) +#if defined( KOKKOS_ENABLE_CUDA ) if (params.use_cuda) { Kokkos::Cuda::print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM @@ -332,6 +333,16 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + Kokkos::Experimental::HIP::print_configuration(std::cout); + KokkosKernels::Experiment::run_multi_mem_triangle + ( + params + ); + } +#endif + Kokkos::finalize(); return 0; diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 681327dfaf..0f6351189b 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -43,32 +43,24 @@ */ #include -#if defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) #include "KokkosSparse_pcg.hpp" #include "KokkosKernels_Utils.hpp" -#include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_default_types.hpp" +#include #define MAXVAL 1 -#define SIZE_TYPE size_t -#define INDEX_TYPE int -#define SCALAR_TYPE double - - - template -scalar_view_t create_x_vector(INDEX_TYPE nv, SCALAR_TYPE max_value = 1.0){ +scalar_view_t create_x_vector(default_lno_t nv, default_scalar max_value = 1.0){ scalar_view_t kok_x ("X", nv); typename scalar_view_t::HostMirror h_x = Kokkos::create_mirror_view (kok_x); - for (INDEX_TYPE i = 0; i < nv; ++i){ - SCALAR_TYPE r = static_cast (rand()) / static_cast (RAND_MAX / max_value); + for (default_lno_t i = 0; i < nv; ++i){ + default_scalar r = static_cast (rand()) / static_cast (RAND_MAX / max_value); h_x(i) = r; } Kokkos::deep_copy (kok_x, h_x); @@ -98,7 +90,7 @@ void run_experiment( typedef typename lno_view_t::value_type size_type; typedef typename scalar_view_t::value_type scalar_t; - INDEX_TYPE nv = crsmat.numRows(); + default_lno_t nv = crsmat.numRows(); scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); @@ -255,25 +247,70 @@ void run_experiment( */ } - - - enum { CMD_USE_THREADS = 0 , CMD_USE_NUMA , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA + , CMD_USE_HIP , CMD_USE_OPENMP - , CMD_USE_CUDA_DEV + , CMD_DEVICE , CMD_BIN_MTX , CMD_CLUSTER_SIZE , CMD_USE_SEQUENTIAL_SGS , CMD_ERROR , CMD_COUNT }; 
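+// The per-backend main() bodies below collapse into one templated driver, and
+// every enabled backend then dispatches through the same three-line idiom. A
+// minimal preview of that idiom for the HIP case (run_pcg is defined next, and
+// the real dispatch appears at the bottom of this file):
+//
+//   #if defined(KOKKOS_ENABLE_HIP)
+//     if (cmdline[CMD_USE_HIP])
+//       run_pcg<Kokkos::Experimental::HIP>(cmdline, mtx_file);
+//   #endif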
+template +void run_pcg(int* cmdline, const char* mtx_file) +{ + default_lno_t nv = 0, ne = 0; + default_lno_t *xadj, *adj; + default_scalar *ew; + + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_file); + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsMat_t::index_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + row_map_view_t rowmap_view("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view("values_view", ne); + + { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + + for (default_lno_t i = 0; i <= nv; ++i){ + hr(i) = xadj[i]; + } + + for (default_lno_t i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + Kokkos::deep_copy (rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); + } + graph_t static_graph (columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); + + delete [] xadj; + delete [] adj; + delete [] ew; + + run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); +} + int main (int argc, char ** argv){ int cmdline[ CMD_COUNT ] ; - char *mtx_bin_file = NULL; + char *mtx_file = NULL; for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ; for ( int i = 1 ; i < argc ; ++i ) { @@ -283,17 +320,22 @@ int main (int argc, char ** argv){ else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] ); } + /* else if ( 0 == strcasecmp( argv[i] , "--cores" ) ) { + //Note BMK: specifying #NUMA regions isn't supported by initialize sscanf( argv[++i] , "%dx%d" , cmdline + CMD_USE_NUMA , cmdline + CMD_USE_CORE_PER_NUMA ); } + */ else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { cmdline[ CMD_USE_CUDA ] = 1 ; } - else if ( 0 == strcasecmp( argv[i] , "--cuda-dev" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + cmdline[ CMD_USE_HIP ] = 1 ; + } + else if ( 0 == strcasecmp( argv[i] , "--device-id" ) ) { + cmdline[ CMD_DEVICE ] = atoi( argv[++i] ) ; } else if ( 0 == strcasecmp( argv[i] , "--cluster-size" ) ) { cmdline[CMD_CLUSTER_SIZE] = atoi(argv[++i]); @@ -303,12 +345,12 @@ int main (int argc, char ** argv){ } else if ( 0 == strcasecmp( argv[i] , "--mtx" ) ) { - mtx_bin_file = argv[++i]; + mtx_file = argv[++i]; } else { cmdline[ CMD_ERROR ] = 1 ; std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; return 0; } @@ -317,190 +359,43 @@ int main (int argc, char ** argv){ if(cmdline[CMD_CLUSTER_SIZE] == 0) cmdline[CMD_CLUSTER_SIZE] = 1; - if (mtx_bin_file == NULL){ - std::cerr << "Provide a mtx binary file" << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp 
[numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + if (mtx_file == NULL){ + std::cerr << "Provide a matrix file" << std::endl ; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[matrix]" << std::endl; return 0; } + Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space -#if defined( KOKKOS_ENABLE_THREADS ) - - if ( cmdline[ CMD_USE_THREADS ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; // How to get this to initialize() without using impl_initialize()? - } - else { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } + init_args.device_id = cmdline[ CMD_DEVICE ]; + if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { + init_args.num_threads = std::max(cmdline[ CMD_USE_THREADS ], cmdline [ CMD_USE_OPENMP ]); + init_args.num_numa = cmdline[ CMD_USE_NUMA ]; + } + else { + init_args.num_threads = cmdline[ CMD_USE_THREADS ]; + } - Kokkos::finalize(); - } + Kokkos::initialize( init_args ); + { +#if defined( KOKKOS_ENABLE_THREADS ) + if(cmdline[CMD_USE_THREADS]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_OPENMP ) - - if ( cmdline[ CMD_USE_OPENMP ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; - } - else { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename 
crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_OPENMP]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_CUDA ) - if ( cmdline[ CMD_USE_CUDA ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - // Use the last device: - init_args.device_id = cmdline[ CMD_USE_CUDA_DEV ]; - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; - } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); - - - } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - // typedef typename KokkosSparse::CrsMatrix crsMat_t; - // crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_CUDA]) + run_pcg(cmdline, mtx_file); #endif - +#if defined( KOKKOS_ENABLE_HIP ) + if(cmdline[CMD_USE_HIP]) + run_pcg(cmdline, mtx_file); +#endif + } + Kokkos::finalize(); return 0; } -#else -int main() { -} -#endif diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index f90c6179f7..959e9d973c 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -60,7 +60,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | 
'--openmp [numThreads]' | '--cuda [cudaDeviceIndex]'" << std::endl; + std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" << std::endl; std::cerr << "\t[Required] --amtx :: 1st input matrix" << std::endl; std::cerr << "\t[Required] --bmtx :: 2nd input matrix" << std::endl; diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index 80e4ab7c34..0f1c9f6210 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -52,7 +52,7 @@ void print_options(){ std::cerr << "\t[Required] INPUT MATRIX: '--amtx [left_hand_side.mtx]' -- for C=AxA" << std::endl; - std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; + std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; std::cerr << "\t[Optional] '--algorithm [DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE|MKL|CUSPARSE|CUSP|VIENNA|MKL2]' --> to choose algorithm. KKMEM is outdated, use KKSPGEMM instead." << std::endl; std::cerr << "\t[Optional] --bmtx [righ_hand_side.mtx]' for C = AxB" << std::endl; std::cerr << "\t[Optional] OUTPUT MATRICES: '--cmtx [output_matrix.mtx]' --> to write output C=AxB" << std::endl; @@ -84,6 +84,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = atoi(getNextArg(i, argc, argv)) + 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = atoi(getNextArg(i, argc, argv)) + 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi(getNextArg(i, argc, argv)); } @@ -297,7 +300,7 @@ int main (int argc, char ** argv){ } const int num_threads = std::max(params.use_openmp, params.use_threads); - const int device_id = params.use_cuda - 1; + const int device_id = params.use_cuda ? params.use_cuda - 1 : params.use_hip - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); Kokkos::print_configuration(std::cout); @@ -336,6 +339,16 @@ int main (int argc, char ** argv){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_spgemm + ( + params + ); + + } +#endif + #if defined( KOKKOS_ENABLE_THREADS ) //If only serial is enabled (or no other device was specified), run with serial if (params.use_threads) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 3a6ea1cca5..6e4af2c7b3 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -50,6 +50,7 @@ #include #include +#include #ifdef HAVE_KOKKOSKERNELS_QUADMATH # include @@ -63,16 +64,6 @@ #ifdef __CUDACC__ # include #endif -// -// mfh 24 Dec 2013: Temporary measure for testing; will go away. 
-// -#ifndef KOKKOS_FORCEINLINE_FUNCTION -# ifdef __CUDA_ARCH__ -# define KOKKOS_FORCEINLINE_FUNCTION inline __host__ __device__ -# else -# define KOKKOS_FORCEINLINE_FUNCTION -# endif // __CUDA_ARCH__ -#endif // KOKKOS_FORCEINLINE_FUNCTION namespace { // anonymous diff --git a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp index f4f682cb91..c7e7613769 100644 --- a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp @@ -5,6 +5,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Scale_Internal.hpp" @@ -111,7 +112,7 @@ namespace KokkosBatched { member.team_barrier(); /// - /// case cuda: team size is large and blocksize (mb,nb) is small + /// GPU case: team size is large and blocksize (mb,nb) is small InnerGemmFixC inner(as0, as1, bs0, bs1, cs0, cs1); auto gemm = [&](const int ib, const int jb, @@ -128,13 +129,16 @@ namespace KokkosBatched { Kokkos::parallel_for (Kokkos::TeamThreadRange(member, mq*nq ), [&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%mq*mb, j = ij/mq*nb; -#else - const int i = ij/nq*mb, j = ij%nq*nb; -#endif + int i, j; + //note: the condition is constexpr + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%mq*mb; + j = ij/mq*nb; + } + else { + i = ij/nq*mb; + j = ij%nq*nb; + } inner.serial_invoke(alpha, AA+i*as0, BB+j*bs1, (i+mb) > ib ? mp : mb, diff --git a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp index 085bd9e293..64d8368f16 100644 --- a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp @@ -5,6 +5,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Scale_Internal.hpp" @@ -114,7 +115,7 @@ namespace KokkosBatched { /// case host: team size is small and blocksize (mb,nb) is large /// - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); @@ -195,7 +196,6 @@ namespace KokkosBatched { const ScalarType alpha, const ValueType *__restrict__ A, const int as0, const int as1, /**/ ValueType *__restrict__ B, const int bs0, const int bs1) { - const ScalarType one(1.0), zero(0.0); // note that parallel range is different ( m*n vs m-1*n); @@ -223,13 +223,15 @@ namespace KokkosBatched { } Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,iend*jend),[&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%iend, j = ij/iend; -#else - const int i = ij/jend, j = ij%jend; -#endif + int i, j; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%iend; + j = ij/iend; + } + else { + i = ij/jend; + j = ij%jend; + } B0[i*bs0+j*bs1] -= a01[i*as0] * b1t[j*bs1]; }); } diff --git a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp index 618f8dc614..5bf26f0865 100644 --- a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp @@ 
-99,7 +99,7 @@ namespace KokkosBatched { if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp index 20ee624006..7d72f01e15 100644 --- a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp @@ -115,7 +115,7 @@ namespace KokkosBatched { if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 2347c63e87..6d6fe4edbd 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -270,7 +270,7 @@ namespace KokkosBatched { // regieter blocking (not about team parallelism). // this mb should vary according to // - team policy (smaller) or range policy (bigger) - // - space (cuda vs host) + // - space (gpu vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. #if defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION static constexpr diff --git a/src/batched/KokkosBatched_Vector.hpp b/src/batched/KokkosBatched_Vector.hpp index 8737d72850..28a537f885 100644 --- a/src/batched/KokkosBatched_Vector.hpp +++ b/src/batched/KokkosBatched_Vector.hpp @@ -104,6 +104,25 @@ namespace KokkosBatched { }; #endif +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; +#endif + template struct DefaultInternalVectorLength { enum : int { value = 1 }; @@ -147,6 +166,25 @@ namespace KokkosBatched { enum : int { value = 1 }; }; #endif + +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultInternalVectorLength { + enum : int { value = 8 }; + }; + template<> + struct DefaultInternalVectorLength { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 2 }; + }; +#endif template struct MagnitudeScalarType; diff --git a/src/batched/KokkosBatched_Vector_SIMD.hpp b/src/batched/KokkosBatched_Vector_SIMD.hpp index d59f0f9be4..e8fe83b7e2 100644 --- a/src/batched/KokkosBatched_Vector_SIMD.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD.hpp @@ -129,7 +129,7 @@ namespace KokkosBatched { } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) namespace KokkosBatched { template<> @@ -143,7 +143,7 @@ namespace KokkosBatched { typedef float2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat2"; } + static const 
char* label() { return "GpuFloat2"; } template friend class Vector; @@ -224,7 +224,7 @@ namespace KokkosBatched { typedef double2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble2"; } + static const char* label() { return "GpuDouble2"; } template friend class Vector; @@ -305,7 +305,7 @@ namespace KokkosBatched { typedef float4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat4"; } + static const char* label() { return "GpuFloat4"; } template friend class Vector; @@ -400,7 +400,7 @@ namespace KokkosBatched { typedef double4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble4"; } + static const char* label() { return "GpuDouble4"; } template friend class Vector; diff --git a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp index 95ab97d882..43ddbb101b 100644 --- a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp @@ -77,7 +77,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -298,7 +298,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -568,7 +568,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -858,7 +858,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 74d15af1c3..db5bc9fbca 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -139,8 +139,8 @@ struct SingleLevelNontransposeGEMV { // matrix A and the input vector x. The output vector y is the // reduction result. // -// WARNING: NOT RECOMMENDED FOR CUDA. Reduction result may have -// arbitrary length. This is bad on CUDA because the CUDA +// WARNING: NOT RECOMMENDED FOR GPU. Reduction result may have +// arbitrary length. This is bad on GPU because the GPU // implementation of Kokkos::parallel_reduce may use shared memory for // intermediate results. 
template { }; #endif +#ifdef KOKKOS_ENABLE_HIP +template +struct impl_gemm_choose_copy_layout { + typedef LayoutA type; +}; +#endif + // DeepCopy matrix block into scratch template struct impl_deep_copy_matrix_block; diff --git a/src/blas/impl/KokkosBlas3_gemm_spec.hpp b/src/blas/impl/KokkosBlas3_gemm_spec.hpp index 877d73c5fa..2a63c3736f 100644 --- a/src/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -157,6 +157,10 @@ struct GEMM { if(std::is_same::value) team_size = blockA0; #endif + #if defined(KOKKOS_ENABLE_HIP) + if(std::is_same::value) + team_size = blockA0; + #endif #if defined(KOKKOS_ENABLE_ROCM) if(std::is_same::value) team_size = blockA0; diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index b22d86a8bb..28b2a01389 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -51,6 +51,7 @@ namespace KokkosKernels{ namespace Impl{ // POP COUNT function returns the number of set bits +// Note BMK: HIP also defines __CUDA_ARCH__, and provides the same intrinsics. #if defined( __CUDA_ARCH__ ) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index c0ae6ce5eb..22930c82e1 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -53,9 +53,9 @@ namespace KokkosKernels{ namespace Impl{ -enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA}; +enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; template -inline ExecSpaceType kk_get_exec_space_type(){ +constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -81,6 +81,12 @@ inline ExecSpaceType kk_get_exec_space_type(){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + exec_space = Exec_HIP; + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ exec_space = Exec_QTHREADS; @@ -90,6 +96,48 @@ inline ExecSpaceType kk_get_exec_space_type(){ } +template +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + auto exec = kk_get_exec_space_type(); + //TODO BMK: Add OpenMPTarget and any other future GPU exec spaces + return exec == Exec_CUDA || exec == Exec_HIP; +} + +//Host function to determine free and total device memory. +//Will throw if execution space doesn't support this. 
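+// Usage sketch (hypothetical sizes), assuming a HIP build:
+//   size_t free_mem = 0, total_mem = 0;
+//   KokkosKernels::Impl::kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
+//       free_mem, total_mem);
+//   // free_mem/total_mem now hold bytes as reported by hipMemGetInfo.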
+template +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + hipMemGetInfo(&free_mem, &total_mem); +} +#endif inline int kk_get_suggested_vector_size( const size_t nr, const size_t nnz, const ExecSpaceType exec_space){ @@ -103,7 +151,7 @@ inline int kk_get_suggested_vector_size( case Exec_QTHREADS: break; case Exec_CUDA: - + case Exec_HIP: if (nr > 0) suggested_vector_size_ = nnz / double (nr) + 0.5; if (suggested_vector_size_ < 3){ @@ -119,7 +167,14 @@ inline int kk_get_suggested_vector_size( suggested_vector_size_ = 16; } else { - suggested_vector_size_ = 32; + if(exec_space == Exec_CUDA || suggested_vector_size_ <= 48) { + //use full CUDA warp, or half a HIP wavefront + suggested_vector_size_ = 32; + } + else { + //use full HIP wavefront + suggested_vector_size_ = 64; + } } break; } @@ -129,7 +184,9 @@ inline int kk_get_suggested_vector_size( inline int kk_get_suggested_team_size(const int vector_size, const ExecSpaceType exec_space){ - if (exec_space == Exec_CUDA){ + if (exec_space == Exec_CUDA || exec_space == Exec_HIP) { + //TODO: where this is used, tune the target value for + //threads per block (but 256 is probably OK for CUDA and HIP) return 256 / vector_size; } else { @@ -171,6 +228,25 @@ struct SpaceInstance { }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct SpaceInstance { + static Kokkos::Experimental::HIP create() { + hipStream_t stream; + hipStreamCreate(&stream); + return Kokkos::Experimental::HIP(stream); + } + static void destroy(Kokkos::Experimental::HIP& space) { + hipStream_t stream = space.hip_stream(); + hipStreamDestroy(stream); + } + static bool overlap() { + //TODO: does HIP have an equivalent for CUDA_LAUNCH_BLOCKING? + return true; + } +}; +#endif + } } diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 9d43ba670c..2e335d4f04 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -371,7 +371,7 @@ class KokkosKernelsHandle return this->team_work_size; } else { - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (my_exec_space == KokkosKernels::Impl::Exec_CUDA || my_exec_space == KokkosKernels::Impl::Exec_HIP) { return team_size; } else { diff --git a/src/common/KokkosKernels_Macros.hpp b/src/common/KokkosKernels_Macros.hpp index 84de9048c9..ced946fe4f 100644 --- a/src/common/KokkosKernels_Macros.hpp +++ b/src/common/KokkosKernels_Macros.hpp @@ -46,10 +46,10 @@ #define _KOKKOSKERNELS_MACROUTILS_HPP_ // If KOKKOSKERNELS_ENABLE_OMP_SIMD is defined, it's legal to place -// "#pragma omp simd" before a for loop. It's never defined if CUDA is enabled, +// "#pragma omp simd" before a for loop. 
It's never defined if a GPU-type device is enabled, // since in that case, Kokkos::ThreadVectorRange should be used instead for SIMD parallel loops. -#if !defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_OPENMP) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ENABLE_OPENMP) #if defined(KOKKOS_COMPILER_GNU) // GCC 4.8.5 and older do not support #pragma omp simd #if (KOKKOS_COMPILER_GNU > 485 ) diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 2547c2e1b9..7628e6de31 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1041,12 +1041,7 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; @@ -1094,12 +1089,7 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsGraphFunctor funct(useRadix, rowmap, entries); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; @@ -1353,74 +1343,45 @@ void kk_sort_graph( out_scalar_view_t out_vals){ ExecSpaceType exec = kk_get_exec_space_type(); - if (exec == Exec_CUDA){ - typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); - Kokkos::deep_copy (hr, in_xadj); - typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); - Kokkos::deep_copy (he, in_adj); - typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); - Kokkos::deep_copy (hv, in_vals); - MyExecSpace().fence(); - - typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); - typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); + // If possible, sort on host and avoid a deep copy + // TODO BMK: can this function be deprecated? 
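+ // Note: when MyExecSpace is host-accessible, each create_mirror_view below
+ // aliases the input view and the matching deep_copy is effectively a no-op,
+ // so the host path pays no extra copies; on CUDA/HIP this costs one round
+ // trip through host memory per array.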
+ typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); + Kokkos::deep_copy (hr, in_xadj); + typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); + Kokkos::deep_copy (he, in_adj); + typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); + Kokkos::deep_copy (hv, in_vals); + MyExecSpace().fence(); + typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); + typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; + typedef typename lno_view_t::non_const_value_type size_type; + typedef typename lno_nnz_view_t::non_const_value_type lno_t; + typedef typename scalar_view_t::non_const_value_type scalar_t; - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); + lno_t nrows = in_xadj.extent(0) - 1; + std::vector > edges(in_adj.extent(0)); - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = hr(i); j < hr(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = he(j); - edges[row_size++].ew = hv(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - heo(i) = edges[i].dst; - hvo(i) = edges[i].ew; + size_type row_size = 0; + for (lno_t i = 0; i < nrows; ++i){ + for (size_type j = hr(i); j < hr(i + 1); ++j){ + edges[row_size].src = i; + edges[row_size].dst = he(j); + edges[row_size++].ew = hv(j); } - - - Kokkos::deep_copy (out_adj, heo); - Kokkos::deep_copy (out_vals, hvo); - MyExecSpace().fence(); } - else { - - - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; - - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); - - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = in_xadj(i); j < in_xadj(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = in_adj(j); - edges[row_size++].ew = in_vals(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - out_adj(i) = edges[i].dst; - out_vals(i) = edges[i].ew; - } - + std::sort (edges.begin(), edges.begin() + row_size); + size_type ne = in_adj.extent(0); + for(size_type i = 0; i < ne; ++i){ + heo(i) = edges[i].dst; + hvo(i) = edges[i].ew; + } - } + Kokkos::deep_copy (out_adj, heo); + Kokkos::deep_copy (out_vals, hvo); + MyExecSpace().fence(); } /* @@ -1714,47 +1675,46 @@ struct LowerTriangularMatrix{ const size_type write_end = t_xadj[row_index + 1]; const lno_t write_left_work = write_end - write_begin; - switch (exec_space){ - case Exec_CUDA: - //TODO: Write cuda version here. - /* + //TODO: Write GPU (vector-level) version here: + /* + if(kk_is_gpu_exec_space()) + { Kokkos::parallel_for( Kokkos::ThreadVectorRange(teamMember, read_left_work), [&] (lno_t i) { const size_type adjind = i + col_begin; const lno_t colIndex = adj[adjind]; - }); - */ + } + else + ... 
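+ One possible shape (untested sketch): compute the write offsets with a
+ vector-level exclusive scan over the keep flags, e.g.
+
+ Kokkos::parallel_scan(
+ Kokkos::ThreadVectorRange(teamMember, read_left_work),
+ [&](const lno_t i, lno_t& offset, const bool final) {
+ const lno_t colIndex = adj[col_begin + i];
+ const bool keep = ...; same lower/upper test as the serial loop below
+ if (final && keep) {
+ if (in_vals != NULL) t_vals[write_begin + offset] = in_vals[col_begin + i];
+ t_adj[write_begin + offset] = colIndex;
+ }
+ offset += keep ? 1 : 0;
+ });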
+ */ - default: - for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ - const size_type adjind = r + col_begin; - const lno_t colIndex = adj[adjind]; - lno_t colperm = colIndex; - if (permutation != NULL){ - colperm = permutation[colIndex]; - } - if (is_lower){ - if (row_perm > colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ + const size_type adjind = r + col_begin; + const lno_t colIndex = adj[adjind]; + lno_t colperm = colIndex; + if (permutation != NULL){ + colperm = permutation[colIndex]; + } + if (is_lower){ + if (row_perm > colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } - else { - if (row_perm < colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + } + else { + if (row_perm < colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } + } - } - break; } }); } @@ -2340,7 +2300,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( bool use_dynamic_scheduling = false, bool chunksize = 4){ -#ifndef KOKKOS_ENABLE_CUDA //typedef typename row_map_view_t::const_type const_row_map_view_t; //typedef typename cols_view_t::const_type const_cols_view_t; @@ -2381,7 +2340,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( } }); -#endif } template void get_suggested_vector_size( int &suggested_vector_size_, - idx nr, idx nnz){ - - suggested_vector_size_ = 1; - -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - suggested_vector_size_ = nnz / double (nr) + 0.5; - - if (suggested_vector_size_ <= 3){ - suggested_vector_size_ = 2; - } - else if (suggested_vector_size_ <= 6){ - suggested_vector_size_ = 4; - } - else if (suggested_vector_size_ <= 12){ - suggested_vector_size_ = 8; - } - else if (suggested_vector_size_ <= 24){ - suggested_vector_size_ = 16; - } - else { - suggested_vector_size_ = 32; - } - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - + idx nr, idx nnz) { + suggested_vector_size_ = kk_get_suggested_vector_size(nr, nnz, get_exec_space_type()); } //Get the best team size for the given functor. 
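// Usage sketch of the consolidated wrapper (hypothetical sizes; template
// parameters abbreviated by this note), assuming a HIP execution space:
//   int vector_size = 0;
//   get_suggested_vector_size<size_t, Kokkos::Experimental::HIP>(
//       vector_size, numRows, numEntries);
//   // Rows averaging more than ~48 entries now get the full 64-lane HIP
//   // wavefront, where CUDA still caps at a 32-lane warp.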
@@ -152,34 +103,28 @@ void get_suggested_vector_size( template int get_suggested_team_size(Functor& f, int vector_size) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp(1, 1, vector_size); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } template int get_suggested_team_size(Functor& f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp = team_policy_t(1, 1, vector_size). set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } template nnz_lno_persistent_work_view_t; typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type - typedef Kokkos::TeamPolicy team_policy_t ; + typedef Kokkos::TeamPolicy team_policy_t ; typedef typename team_policy_t::member_type team_member_t ; typedef typename Kokkos::View non_const_1d_size_type_view_t; @@ -229,54 +229,17 @@ class GraphColoringHandle } - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. + /** \brief Chooses best algorithm based on the execution space. COLORING_SERIAL if serial, otherwise COLORING_VBBIT. + * VBBIT is the fastest parallel algorithm (unless on GPU and the graph's maximum degree is very large, but + * we don't have information about the graph here) */ void choose_default_algorithm() { -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ + auto exec = KokkosKernels::Impl::kk_get_exec_space_type(); + if(exec == KokkosKernels::Impl::Exec_SERIAL) this->coloring_algorithm_type = COLORING_SERIAL; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - this->coloring_algorithm_type = COLORING_EB; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif + else + this->coloring_algorithm_type = COLORING_VBBIT; } template @@ -463,7 +426,7 @@ class GraphColoringHandle row_index_view_type xadj, nonzero_view_type adj){ KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list - + ( nv, xadj, @@ -496,13 +459,8 @@ class GraphColoringHandle size_type_temp_work_view_t lower_count("LowerXADJ", nv + 1); size_type 
new_num_edge = 0; - typedef Kokkos::RangePolicy my_exec_space; - - if ( false -#if defined( KOKKOS_ENABLE_CUDA ) - || std::is_same::value -#endif - ) + typedef Kokkos::RangePolicy my_exec_space; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { @@ -522,10 +480,10 @@ class GraphColoringHandle clt//, new_num_edge ); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); //Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - HandleExecSpace().fence(); + ExecutionSpace().fence(); auto lower_total_count = Kokkos::subview(lower_count, nv); auto hlower = Kokkos::create_mirror_view (lower_total_count); Kokkos::deep_copy (hlower, lower_total_count); @@ -551,7 +509,7 @@ class GraphColoringHandle //Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); nnz_lno_persistent_work_view_t half_src (Kokkos::ViewAllocateWithoutInitializing("HALF SRC"),new_num_edge); nnz_lno_persistent_work_view_t half_dst (Kokkos::ViewAllocateWithoutInitializing("HALF DST"),new_num_edge); diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp index f4624f545b..4c392051fb 100644 --- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -198,71 +198,17 @@ class GraphColorDistance2Handle * Chooses best algorithm based on the execution space. * * This chooses the best algorithm based on the execution space: - * - COLORING_D2_SERIAL if the execution space is SERIAL - * - COLORING_D2_NB_BIT otherwise + * - COLORING_D2_SERIAL if the execution space is SERIAL (more work efficient than NB_BIT) + * - COLORING_D2_NB_BIT otherwise (fastest parallel algorithm) * */ void choose_default_algorithm() { - bool found = false; -#if defined(KOKKOS_ENABLE_SERIAL) - if(std::is_same::value) - { + if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) this->coloring_algorithm_type = COLORING_D2_SERIAL; - found = true; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_THREADS) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_OPENMP) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_QTHREAD) - if(std::is_same::value) - { + else this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - //Since this logic is based on checking every exec space, detect when a new one needs to be supported - if(!found) - throw std::logic_error("D2 coloring: default algorithm hasn't been chosen for 
the current execution space"); } diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 0a5493df7d..866ad54daf 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -396,7 +396,7 @@ struct D2_MIS_RandomPriority Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = (execSpaceEnum == KokkosKernels::Impl::Exec_CUDA) && (entries.extent(0) / numVerts >= 16); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; lno_t rowWorkLen = numVerts; diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index 938d6e91be..c618d3add6 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -104,6 +104,12 @@ inline int RowsPerThread(const int NNZPerRow) { return 1; } #endif +#ifdef KOKKOS_ENABLE_HIP +template<> +inline int RowsPerThread(const int NNZPerRow) { + return 1; +} +#endif // A simple struct for storing a kernel launch configuration. // This is currently used by CrsMatrix to allow the user to have some control diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index 2def3a17f1..fd4a9b58d9 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -274,53 +274,11 @@ namespace KokkosSparse{ void set_block_size(nnz_lno_t bs){this->block_size = bs; } nnz_lno_t get_block_size() const {return this->block_size;} - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. 
- */ void choose_default_algorithm(){ -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) this->algorithm_type = GS_TEAM; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: GS_TEAM" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ + else this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif } ~PointGaussSeidelHandle() = default; @@ -559,33 +517,7 @@ namespace KokkosSparse{ bool use_teams() const { - bool return_value = false; -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value) { - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - return_value = true; - } -#endif -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - return_value = false; - } -#endif - return return_value; + return KokkosKernels::Impl::kk_is_gpu_exec_space(); } ~ClusterGaussSeidelHandle() = default; diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 820afbbaa3..9ed66ce2ad 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -202,67 +202,6 @@ struct UnmergedSumFunctor { CcolindsT ABperm; }; -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, const CcolindsT& Ccolinds_, - const CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), - Ccolinds(Ccolinds_), - CcolindsAux("C colind aux", Ccolinds_.extent(0)), - ABperm(ABperm_), - ABpermAux("AB perm aux", ABperm_.extent(0)) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - ordinal_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - using lno_t = typename CcolindsT::non_const_value_type; - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::Impl::SerialRadixSort2( - (unsigned_lno_t*)Ccolinds.data() + rowStart, - (unsigned_lno_t*)CcolindsAux.data() + 
rowStart, - ABperm.data() + rowStart, ABpermAux.data() + rowStart, rowNum); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT CcolindsAux; - CcolindsT ABperm; - CcolindsT ABpermAux; -}; - -#ifdef KOKKOS_ENABLE_CUDA -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, CcolindsT& Ccolinds_, - CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), Ccolinds(Ccolinds_), ABperm(ABperm_) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - size_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - KokkosKernels::Impl::TeamBitonicSort2< - size_type, typename CcolindsT::non_const_value_type, - typename CcolindsT::non_const_value_type, TeamMember>( - Ccolinds.data() + rowStart, ABperm.data() + rowStart, rowNum, t); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT ABperm; -}; -#endif - template struct MergeEntriesFunctor { diff --git a/src/sparse/KokkosSparse_spgemm_handle.hpp b/src/sparse/KokkosSparse_spgemm_handle.hpp index b34d349457..f517682d5e 100644 --- a/src/sparse/KokkosSparse_spgemm_handle.hpp +++ b/src/sparse/KokkosSparse_spgemm_handle.hpp @@ -504,8 +504,6 @@ class SPGEMMHandle{ return this->cuSPARSEHandle; } #endif - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. - */ void choose_default_algorithm(){ #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -543,6 +541,15 @@ class SPGEMMHandle{ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + this->algorithm_type = SPGEMM_KK; +#ifdef VERBOSE + std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK" << std::endl; +#endif + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ this->algorithm_type = SPGEMM_SERIAL; @@ -604,67 +611,20 @@ class SPGEMMHandle{ //suggested_vector_size_=this->suggested_vector_size = 1; //return; if (this->suggested_team_size && this->suggested_vector_size) { + //already set in the handle suggested_vector_size_ = this->suggested_vector_size; suggested_team_size_ = this->suggested_team_size; return; } -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - this->suggested_vector_size = nnz / double (nr) + 0.5; - - if (this->suggested_vector_size <= 3){ - this->suggested_vector_size = 2; - } - else if (this->suggested_vector_size <= 6){ - this->suggested_vector_size = 
4;
-    }
-    else if (this->suggested_vector_size <= 12){
-      this->suggested_vector_size = 8;
-    }
-    else if (this->suggested_vector_size <= 24){
-      this->suggested_vector_size = 16;
-    }
-    else {
-      this->suggested_vector_size = 32;
-    }
-
-      suggested_vector_size_ = this->suggested_vector_size;
-      this->suggested_team_size= suggested_team_size_ = max_allowed_team_size / this->suggested_vector_size;
-    }
-#endif
-
-#if defined( KOKKOS_ENABLE_QTHREAD)
-    if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){
-      suggested_vector_size_ = this->suggested_vector_size = 1;
-      suggested_team_size_ = this->suggested_team_size = max_allowed_team_size;
-    }
-#endif
-
+    //otherwise, recompute team_size/vector_size based on heuristic and save them in the handle
+    suggested_vector_size_ = KokkosKernels::Impl::kk_get_suggested_vector_size(nr, nnz, KokkosKernels::Impl::kk_get_exec_space_type<ExecutionSpace>());
+    if(KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>())
+      suggested_team_size_ = max_allowed_team_size / suggested_vector_size_;
+    else
+      suggested_team_size_ = max_allowed_team_size;
+    this->suggested_vector_size = suggested_vector_size_;
+    this->suggested_team_size = suggested_team_size_;
   }

   void set_compression_steps(bool isCompressionSingleStep){
diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
index 03eef00e4d..d956ed8d4d 100644
--- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
@@ -554,9 +554,8 @@ namespace KokkosSparse{
             }
           });

-#if !defined(__CUDA_ARCH__)
 #if KOKKOSSPARSE_IMPL_PRINTDEBUG
-          if (/*i == 0 && ii == 1*/ ii == 0 || (block_size == 1 && ii < 2) ){
+          if (!KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>() && (ii == 0 || (block_size == 1 && ii < 2))){
             std::cout << "\n\n\nrow:" << ii * block_size + i;
             std::cout << "\nneighbors:";
             for (nnz_lno_t z = 0; z < block_row_size; ++z){
@@ -573,7 +572,6 @@ namespace KokkosSparse{
             std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector(block_row_index):" << _Xvector(ii * block_size + i, vec) << std::endl << std::endl<< std::endl;
           }
-#endif
 #endif
           //row_begin += row_size * block_size;
         }
@@ -737,31 +735,16 @@ namespace KokkosSparse{
         timer.reset();
 #endif

-
-#if defined( KOKKOS_ENABLE_CUDA )
-        if (std::is_same::value){
-          for (nnz_lno_t i = 0; i < numColors; ++i){
-            nnz_lno_t color_index_begin = h_color_xadj(i);
-            nnz_lno_t color_index_end = h_color_xadj(i + 1);
-
-            if (color_index_begin + 1 >= color_index_end ) continue;
-            auto colorsubset =
-                subview(color_adj, Kokkos::pair (color_index_begin, color_index_end));
-            MyExecSpace().fence();
-            Kokkos::sort (colorsubset);
-            //TODO: MD 08/2017: If I remove the below fence, code fails on cuda.
-            //I do not see any reason yet it to fail.
-            MyExecSpace().fence();
-          }
-        }
-#endif
-
-        MyExecSpace().fence();
+        // TODO BMK: Why are the vertices in each color set only being sorted on GPU?
+        // Wouldn't it have a locality benefit on CPU too?
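The hunks above collapse per-backend #ifdef ladders into one compile-time test. Below is a minimal sketch of that dispatch idiom, assuming a helper shaped like KokkosKernels::Impl::kk_is_gpu_exec_space; the specialization list and the get_team_sizes wrapper are illustrative, not the library's exact code.

    #include <Kokkos_Core.hpp>

    // The general template answers "not a GPU"; each device backend
    // enabled at configure time specializes the answer to true.
    template <typename ExecSpace>
    constexpr bool is_gpu_exec_space() { return false; }

    #ifdef KOKKOS_ENABLE_CUDA
    template <>
    constexpr bool is_gpu_exec_space<Kokkos::Cuda>() { return true; }
    #endif
    #ifdef KOKKOS_ENABLE_HIP
    template <>
    constexpr bool is_gpu_exec_space<Kokkos::Experimental::HIP>() { return true; }
    #endif

    // Usage mirroring the handle code above: one branch now covers CUDA
    // and HIP alike, instead of one #ifdef block per backend.
    template <typename ExecSpace>
    void get_team_sizes(int max_team, int vector_length, int& team_out) {
      if (is_gpu_exec_space<ExecSpace>())
        team_out = max_team / vector_length; // GPU: team split across vector lanes
      else
        team_out = max_team;                 // host: vector length is effectively 1
    }

With this shape, bringing up a new GPU backend means adding one specialization rather than touching every call site.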
+        if(KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
+          KokkosKernels::Impl::sort_crs_graph<MyExecSpace>(color_xadj, color_adj);
+          MyExecSpace().fence();
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
-        std::cout << "SORT_TIME:" << timer.seconds() << std::endl;
-        timer.reset();
-        //std::cout << "sort" << std::endl;
+          std::cout << "SORT_TIME:" << timer.seconds() << std::endl;
+          timer.reset();
 #endif
+        }

         row_lno_persistent_work_view_t permuted_xadj ("new xadj", num_rows + 1);
         nnz_lno_persistent_work_view_t old_to_new_map ("old_to_new_index_", num_rows );
@@ -844,7 +827,7 @@ namespace KokkosSparse{
         nnz_lno_t num_big_rows = 0;

         KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space();
-        if (ex_sp != KokkosKernels::Impl::Exec_CUDA){
+        if (!KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
           //again, if it is on CPUs, we make L1 as big as we need.
           size_t l1mem = 1;
           while(l1mem < level_1_mem){
@@ -882,12 +865,11 @@ namespace KokkosSparse{
           num_big_rows = KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(MyExecSpace::concurrency() / suggested_vector_size));
           //std::cout << "num_big_rows:" << num_big_rows << std::endl;

-#if defined( KOKKOS_ENABLE_CUDA )
-          if (ex_sp == KokkosKernels::Impl::Exec_CUDA) {
+          if (KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
            //check if we have enough memory for this. lower the concurrency if we do not have enough memory.
            size_t free_byte ;
            size_t total_byte ;
-            cudaMemGetInfo( &free_byte, &total_byte ) ;
+            KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte);
            size_t required_size = size_t (num_big_rows) * level_2_mem;
            if (required_size + num_big_rows * sizeof(int) > free_byte){
              num_big_rows = ((((free_byte - num_big_rows * sizeof(int))* 0.8) /8 ) * 8) / level_2_mem;
@@ -900,7 +882,6 @@ namespace KokkosSparse{
              num_big_rows = min_chunk_size;
            }
          }
-#endif
        }
      }
@@ -1165,7 +1146,7 @@ namespace KokkosSparse{
       // change fill_matrix_numeric so that they store the internal matrix as above.
       // the rest will work fine.
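The hunk just above swaps cudaMemGetInfo for a backend-neutral query and then shrinks the pool to fit. The same fit-then-round computation recurs in the SpGEMM files below; here it is as a standalone sketch (fit_num_chunks is a hypothetical name, and the headroom factor varies by call site between 0.5 and 0.8 in this patch).

    #include <cstddef>

    // Cap a memory-pool chunk count by available device memory, then round
    // it down to a power of two, as the kernels in this patch do.
    size_t fit_num_chunks(size_t num_chunks, size_t chunk_bytes, size_t free_bytes) {
      size_t required = num_chunks * chunk_bytes;
      if (required > free_bytes) {
        // keep headroom: spend about half of free memory, in 8-byte multiples
        num_chunks = (((free_bytes / 2) / 8) * 8) / chunk_bytes;
      }
      size_t p2 = 1;               // largest power of two <= num_chunks,
      while (p2 * 2 <= num_chunks) // so chunk indexing stays cheap
        p2 *= 2;
      return p2;
    }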
- if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", team_policy_t(num_rows / rows_per_team + 1 , suggested_team_size, suggested_vector_size), fill_matrix_numeric( @@ -1209,7 +1190,7 @@ namespace KokkosSparse{ block_size, block_matrix_size); - if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA || block_size > 1){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || block_size > 1){ Kokkos::parallel_for("KokkosSparse::GaussSeidel::team_get_matrix_diagonals", team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd ); diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp index 0ef887d80e..af10787c46 100644 --- a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -74,535 +74,6 @@ struct IotaFunctor View v; }; -template -struct RCM -{ - typedef typename HandleType::HandleExecSpace MyExecSpace; - typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - - typedef typename HandleType::size_type size_type; - typedef typename HandleType::nnz_lno_t nnz_lno_t; - - typedef typename lno_row_view_t::const_type const_lno_row_view_t; - typedef typename lno_row_view_t::non_const_type non_const_lno_row_view_t; - typedef typename non_const_lno_row_view_t::value_type offset_t; - - typedef typename lno_nnz_view_t::const_type const_lno_nnz_view_t; - typedef typename lno_nnz_view_t::non_const_type non_const_lno_nnz_view_t; - - typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; - typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; - typedef typename HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; //Host view type - - typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type - - typedef nnz_lno_persistent_work_view_t nnz_view_t; - typedef Kokkos::View> single_view_t; - typedef Kokkos::View> single_view_host_t; - - typedef Kokkos::RangePolicy my_exec_space; - - typedef Kokkos::Device device_t; - - typedef Kokkos::RangePolicy range_policy_t ; - typedef Kokkos::TeamPolicy team_policy_t ; - typedef typename team_policy_t::member_type team_member_t ; - - typedef nnz_lno_t LO; - - RCM(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_) - : numRows(numRows_), rowmap(rowmap_), colinds(colinds_) - {} - - nnz_lno_t numRows; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - - //radix sort keys according to their corresponding values ascending. - //keys are NOT preserved since the use of this in RCM doesn't care about degree after sorting - template - KOKKOS_INLINE_FUNCTION static void - radixSortKeysAndValues(KeyType* keys, KeyType* keysAux, ValueType* values, ValueType* valuesAux, IndexType n, const member_t& mem) - { - if(n <= 1) - return; - //sort 4 bits at a time - KeyType mask = 0xF; - bool inAux = false; - //maskPos counts the low bit index of mask (0, 4, 8, ...) 
- IndexType maskPos = 0; - IndexType sortBits = 0; - KeyType minKey = Kokkos::ArithTraits::max(); - KeyType maxKey = 0; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lminkey) - { - if(keys[i] < lminkey) - lminkey = keys[i]; - }, Kokkos::Min(minKey)); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lmaxkey) - { - if(keys[i] > lmaxkey) - lmaxkey = keys[i]; - }, Kokkos::Max(maxKey)); - //apply a bias so that key range always starts at 0 - //also invert key values here for a descending sort - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - keys[i] -= minKey; - }); - KeyType upperBound = maxKey - minKey; - while(upperBound) - { - upperBound >>= 1; - sortBits++; - } - for(IndexType s = 0; s < (sortBits + 3) / 4; s++) - { - //Count the number of elements in each bucket - IndexType count[16] = {0}; - IndexType offset[17]; - if(!inAux) - { - for(IndexType i = 0; i < n; i++) - { - count[(keys[i] & mask) >> maskPos]++; - } - } - else - { - for(IndexType i = 0; i < n; i++) - { - count[(keysAux[i] & mask) >> maskPos]++; - } - } - offset[0] = 0; - //get offset as the prefix sum for count - for(IndexType i = 0; i < 16; i++) - { - offset[i + 1] = offset[i] + count[i]; - } - //now for each element in [lo, hi), move it to its offset in the other buffer - //this branch should be ok because whichBuf is the same on all threads - if(!inAux) - { - //copy from *Over to *Aux - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keys[i] & mask) >> maskPos; - keysAux[offset[bucket + 1] - count[bucket]] = keys[i]; - valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; - count[bucket]--; - } - } - else - { - //copy from *Aux to *Over - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keysAux[i] & mask) >> maskPos; - keys[offset[bucket + 1] - count[bucket]] = keysAux[i]; - values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; - count[bucket]--; - } - } - inAux = !inAux; - mask = mask << 4; - maskPos += 4; - } - //move keys/values back from aux if they are currently in aux, - //and remove bias - if(inAux) - { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - //TODO: when everything works, is safe to remove next line - //since keys (BFS visit scores) will never be needed again - keys[i] = keysAux[i]; - values[i] = valuesAux[i]; - }); - } - } - - //Functor that does breadth-first search on a sparse graph. 
- struct BfsFunctor - { - typedef Kokkos::View> WorkView; - - BfsFunctor(const WorkView& workQueue_, const WorkView& scratch_, const nnz_view_t& visit_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const single_view_t& numLevels_, const nnz_view_t& threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_) - : workQueue(workQueue_), scratch(scratch_), visit(visit_), rowmap(rowmap_), colinds(colinds_), numLevels(numLevels_), threadNeighborCounts(threadNeighborCounts_), start(start_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - const nnz_lno_t QUEUED = NOT_VISITED - 1; - int nthreads = mem.team_size(); - nnz_lno_t tid = mem.team_rank(); - auto neighborList = Kokkos::subview(scratch, tid, Kokkos::ALL()); - //active and next indicate which buffer in workQueue holds the nodes in current/next frontiers, respectively - //active, next and visitCounter are thread-local, but always kept consistent across threads - int active = 0; - int next = 1; - nnz_lno_t visitCounter = 0; - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - workQueue(active, 0) = start; - visit(start) = QUEUED; - }); - nnz_lno_t activeQSize = 1; - nnz_lno_t nextQSize = 0; - //KK create_reverse_map() expects incoming values to start at 1 - nnz_lno_t level = 1; - //do this until all nodes have been visited and added to a level - while(visitCounter < numRows) - { - mem.team_barrier(); - //each thread works on a contiguous block of nodes in queue (for locality) - //compute in size_t to avoid possible 32-bit overflow - nnz_lno_t workStart = tid * activeQSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * activeQSize / nthreads; - //the maximum work batch size (among all threads) - //the following loop contains barriers so all threads must iterate same # of times - nnz_lno_t maxBatch = (activeQSize + nthreads - 1) / nthreads; - for(nnz_lno_t loop = 0; loop < maxBatch; loop++) - { - //this thread may not actually have anything to work on (if nthreads doesn't divide qSize) - bool busy = loop < workEnd - workStart; - nnz_lno_t neiCount = 0; - nnz_lno_t process = LNO_MAX; - if(busy) - { - process = workQueue(active, workStart + loop); - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - //build a list of all non-visited neighbors - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t col = colinds(j); - //use atomic here to guarantee neighbors are added to neighborList exactly once - if(col < numRows && Kokkos::atomic_compare_exchange_strong(&visit(col), NOT_VISITED, QUEUED)) - { - //this thread is the first to see that col needs to be queued - neighborList(neiCount) = col; - neiCount++; - } - } - } - threadNeighborCounts(tid) = neiCount; - mem.team_barrier(); - size_type queueUpdateOffset = 0; - for(nnz_lno_t i = 0; i < tid; i++) - { - queueUpdateOffset += threadNeighborCounts(i); - } - //write out all updates to next queue in parallel - if(busy) - { - nnz_lno_t nextQueueIter = 0; - for(nnz_lno_t i = 0; i < neiCount; i++) - { - nnz_lno_t toQueue = neighborList(i); - visit(toQueue) = QUEUED; - workQueue(next, nextQSize + queueUpdateOffset + nextQueueIter) = toQueue; - nextQueueIter++; - } - //assign level to to process - visit(process) = level; - } - nnz_lno_t totalAdded = 0; - for(nnz_lno_t i = 0; i < nthreads; i++) - { - totalAdded += threadNeighborCounts(i); - } - nextQSize += totalAdded; - mem.team_barrier(); - } - 
//swap queue buffers - active = next; - next = 1 - next; - //all threads have a consistent value of qSize here. - //update visitCounter in preparation for next frontier - visitCounter += activeQSize; - activeQSize = nextQSize; - nextQSize = 0; - if(visitCounter < numRows && activeQSize == 0) - { - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - //Some nodes are unreachable from start (graph not connected) - //Find an unvisited node to resume BFS - for(nnz_lno_t search = numRows - 1; search >= 0; search--) - { - if(visit(search) == NOT_VISITED) - { - workQueue(active, 0) = search; - visit(search) = QUEUED; - break; - } - } - }); - activeQSize = 1; - } - level++; - } - Kokkos::single(Kokkos::PerTeam(mem), - [&] - { - numLevels() = level - 1; - }); - } - - WorkView workQueue; - WorkView scratch; - nnz_view_t visit; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - single_view_t numLevels; - nnz_view_t threadNeighborCounts; - nnz_lno_t start; - nnz_lno_t numRows; - }; - - //Parallel breadth-first search, producing level structure in (xadj, adj) form: - //xadj(level) gives index in adj where level begins. - //Returns the total number of levels, and sets xadj, adj and maxDeg. - nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads) - { - //need to know maximum degree to allocate scratch space for threads - maxDeg = KokkosKernels::Impl::graph_max_degree(rowmap); - //view for storing the visit timestamps - nnz_view_t visit("BFS visited nodes", numRows); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - KokkosBlas::fill(visit, NOT_VISITED); - //the visit queue - //one of q1,q2 is active at a time and holds the nodes to process in next BFS level - //elements which are LNO_MAX are just placeholders (nothing to process) - Kokkos::View> workQueue("BFS queue (double buffered)", 2, numRows); - nnz_view_t threadNeighborCounts("Number of nodes to queue on each thread", nthreads); - single_view_t numLevels("# of BFS levels"); - single_view_host_t numLevelsHost("# of BFS levels"); - Kokkos::View> scratch("Scratch buffer shared by threads", nthreads, maxDeg); - Kokkos::parallel_for(team_policy_t(1, nthreads), BfsFunctor(workQueue, scratch, visit, rowmap, colinds, numLevels, threadNeighborCounts, start, numRows)); - Kokkos::deep_copy(numLevelsHost, numLevels); - //now that level structure has been computed, construct xadj/adj - KokkosKernels::Impl::create_reverse_map - (numRows, numLevelsHost(), visit, xadj, adj); - return numLevelsHost(); - } - - struct CuthillMcKeeFunctor - { - typedef Kokkos::View> ScoreView; - - CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const ScoreView& scores_, const ScoreView& scoresAux_, const nnz_view_t& visit_, const nnz_view_t& xadj_, const nnz_view_t& adj_, const nnz_view_t& adjAux_) - : numLevels(numLevels_), maxDegree(maxDegree_), rowmap(rowmap_), colinds(colinds_), scores(scores_), scoresAux(scoresAux_), visit(visit_), xadj(xadj_), adj(adj_), adjAux(adjAux_) - { - numRows = rowmap.extent(0) - 1; - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - int tid = mem.team_rank(); - int nthreads = mem.team_size(); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - nnz_lno_t visitCounter = 0; - for(nnz_lno_t level = 0; level < numLevels; level++) - { - //iterate over vertices in this level and compute - //min predecessors 
(minimum-labeled vertices from previous level) - nnz_lno_t levelOffset = xadj(level); - nnz_lno_t levelSize = xadj(level + 1) - levelOffset; - //compute as offset_t to avoid overflow, but the upper bound on - //the scores is approx. numRows * maxDegree, which should be representable - nnz_lno_t workStart = tid * levelSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * levelSize / nthreads; - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - nnz_lno_t minNeighbor = LNO_MAX; - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t neighbor = colinds(j); - if(neighbor < numRows) - { - nnz_lno_t neighborVisit = visit(neighbor); - if(neighborVisit < minNeighbor) - minNeighbor = neighborVisit; - } - } - scores(i) = ((offset_t) minNeighbor * (maxDegree + 1)) + (rowmap(process + 1) - rowmap(process)); - } - mem.team_barrier(); - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - radixSortKeysAndValues - (scores.data(), scoresAux.data(), adj.data() + levelOffset, adjAux.data(), levelSize, mem); - }); - mem.team_barrier(); - //label all vertices (which are now in label order within their level) - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - //visit counter increases with levels, so flip the range for the "reverse" in RCM - visit(process) = visitCounter + i; - } - visitCounter += levelSize; - } - } - - nnz_lno_t numRows; - nnz_lno_t numLevels; - nnz_lno_t maxDegree; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - ScoreView scores; - ScoreView scoresAux; - nnz_view_t visit; - //The levels, stored in CRS format. - //xadj stores offsets for each level, and adj stores the rows in each level. 
- nnz_view_t xadj; - nnz_view_t adj; - nnz_view_t adjAux; - }; - - //Does the reversing in "reverse Cuthill-McKee") - struct OrderReverseFunctor - { - OrderReverseFunctor(const nnz_view_t& visit_, nnz_lno_t numRows_) - : visit(visit_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - visit(i) = numRows - visit(i) - 1; - } - nnz_view_t visit; - nnz_lno_t numRows; - }; - - //breadth-first search, producing a reverse Cuthill-McKee ordering - nnz_view_t parallel_cuthill_mckee(nnz_lno_t start) - { - size_type nthreads = MyExecSpace::concurrency(); - if(nthreads > 64) - nthreads = 64; - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) - { - nthreads = 256; - } - #endif - nnz_view_t xadj, adj; - nnz_lno_t maxDegree = 0; - //parallel_bfs will compute maxDegree - auto numLevels = parallel_bfs(start, xadj, adj, maxDegree, nthreads); - //xadj determines where each level set starts and begins, - //so its max 'degree' gives the size of the largest level - nnz_lno_t maxLevelSize = KokkosKernels::Impl::graph_max_degree(xadj); - std::cout << "Maximum size of a level set: " << maxLevelSize << '\n'; - //visit (to be returned) contains the RCM numberings of each row - nnz_view_t visit("RCM labels", numRows); - //Populate visit wth LNO_MAX so that the "min-labeled neighbor" - //is always a node in the previous level - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - KokkosBlas::fill(visit, LNO_MAX); - //the "score" of a node is a single value that provides an ordering equivalent - //to sorting by min predecessor and then by min degree - //reduce nthreads to be a power of 2 - Kokkos::View> scores("RCM scores for sorting", maxLevelSize); - Kokkos::View> scoresAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - nnz_view_t adjAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - Kokkos::parallel_for(team_policy_t(1, nthreads), CuthillMcKeeFunctor(numLevels, maxDegree, rowmap, colinds, scores, scoresAux, visit, xadj, adj, adjAux)); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(visit, numRows)); - return visit; - } - - template - struct MinDegreeRowFunctor - { - typedef typename Reducer::value_type Value; - MinDegreeRowFunctor(const const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {} - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, Value& lval) const - { - size_type ideg = rowmap(i + 1) - rowmap(i); - if(ideg < lval.val) - { - lval.val = ideg; - lval.loc = i; - } - } - const_lno_row_view_t rowmap; - }; - - //parallel-for functor that assigns a cluster given a envelope-reduced reordering (like RCM) - struct OrderToClusterFunctor - { - OrderToClusterFunctor(const nnz_view_t& ordering_, const nnz_view_t& vertClusters_, nnz_lno_t clusterSize_) - : ordering(ordering_), vertClusters(vertClusters_), clusterSize(clusterSize_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - vertClusters(i) = ordering(i) / clusterSize; - } - - const nnz_view_t ordering; - nnz_view_t vertClusters; - nnz_lno_t clusterSize; - }; - - //Find a peripheral node (one of minimal degree), suitable for starting RCM or BFS - nnz_lno_t find_peripheral() - { - typedef Kokkos::MinLoc MinLocReducer; - typedef typename MinLocReducer::value_type MinLocVal; - MinLocVal v; - Kokkos::parallel_reduce(range_policy_t(0, numRows), - MinDegreeRowFunctor(rowmap), MinLocReducer(v)); - return v.loc; - } - - nnz_view_t cuthill_mckee() - { - nnz_lno_t periph = find_peripheral(); 
- //run Cuthill-McKee BFS from periph - auto ordering = parallel_cuthill_mckee(periph); - return ordering; - } - - nnz_view_t rcm() - { - nnz_view_t cm = cuthill_mckee(); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(cm, numRows)); - return cm; - } - - nnz_view_t cm_cluster(nnz_lno_t clusterSize) - { - nnz_view_t cm = cuthill_mckee(); - nnz_view_t vertClusters("Vert to cluster", numRows); - OrderToClusterFunctor makeClusters(cm, vertClusters, clusterSize); - Kokkos::parallel_for(range_policy_t(0, numRows), makeClusters); - return vertClusters; - } -}; - template struct BalloonClustering { diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 6d240d11b3..c881c98ed4 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -219,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } @@ -761,6 +765,7 @@ bool KokkosSPGEMM { //get the execution space type. KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); //get the suggested vectorlane size based on the execution space, and average number of nnzs per row. int suggested_vector_size = this->handle->get_suggested_vector_size(n, nnz); //get the suggested team size. @@ -791,7 +796,7 @@ bool KokkosSPGEMM out_nnz_view_t set_nexts_; out_nnz_view_t set_begins_; #ifdef KOKKOSKERNELSMOREMEM - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { set_nexts_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_nexts_"), nnz); set_begins_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_begins_"), nnz); Kokkos::deep_copy (set_begins_, -1); @@ -804,8 +809,9 @@ bool KokkosSPGEMM } //if compressing in single step, allocate the memory as upperbound. - //TODO: two step is not there for cuda. - if (compress_in_single_step || lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + //TODO: two step is not there for GPU. 
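Each SpGEMM functor in this file and the ones below also gains a one-line Exec_HIP case in the switch that decides which row index a hash-map accumulator thread works on, mirroring the existing CUDA case. A sketch of that switch as a free function; the wrapper and the host fallback value are illustrative, since the surrounding functor code is elided in these hunks.

    #include "KokkosKernels_ExecSpaceUtils.hpp"

    // GPU backends give every thread its own row; the host fallback shown
    // here (one row block per team) is a stand-in for the elided logic.
    inline int hash_row_index(KokkosKernels::Impl::ExecSpaceType sp,
                              int row_index, int team_row_begin) {
      switch (sp) {
    #if defined(KOKKOS_ENABLE_CUDA)
        case KokkosKernels::Impl::Exec_CUDA: return row_index;
    #endif
    #if defined(KOKKOS_ENABLE_HIP)
        case KokkosKernels::Impl::Exec_HIP:  return row_index; // same policy as CUDA
    #endif
        default: return team_row_begin;
      }
    }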
+ + if (compress_in_single_step || exec_gpu) { out_nnz_indices = out_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("set_entries_"), nnz); out_nnz_sets = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_indices_"), nnz); } @@ -834,7 +840,8 @@ bool KokkosSPGEMM timer1.reset(); //bool compression_applied = false; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + #ifndef KOKKOSKERNELSMOREMEM size_type max_row_nnz = 0; @@ -856,27 +863,23 @@ bool KokkosSPGEMM size_t num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks*sizeof(int) > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - size_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks*sizeof(int) > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; + } + { + size_t min_chunk_size = 1; + while (min_chunk_size * 2 <= num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; + } + } if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\tPOOL chunksize:" << chunksize << " num_chunks:" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index aa73c1e55b..4924e11b0c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -124,10 +124,9 @@ void KokkosSPGEMM KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); - //compress in single step if it is cuda execution space. - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA) { + //compress in single step if it is GPU. + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) compress_in_single_step = true; - } //compressed B fields. 
row_lno_temp_work_view_t new_row_mapB(Kokkos::ViewAllocateWithoutInitializing("new row map"), n+1); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 3f29c39e4e..38fce91b1b 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -234,6 +234,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1244,7 +1248,7 @@ void //choose parameters if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //then chose the best method and parameters. size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1374,7 +1378,7 @@ void //required memory for L2 - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; @@ -1419,12 +1423,9 @@ void } int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1439,7 +1440,6 @@ void num_chunks = min_chunk_size; } } -#endif // END SIZE CALCULATIONS FOR MEMORYPOOL @@ -1455,7 +1455,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1505,7 +1505,7 @@ void } timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; @@ -1617,7 +1617,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1667,7 +1667,7 @@ void } timer1.reset(); - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1 , suggested_team_size, suggested_vector_size), sc); MyExecSpace().fence(); } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 415bd1ed3a..e3a4f492a6 100644 --- 
a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -143,6 +143,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -481,7 +485,7 @@ struct KokkosSPGEMM // // Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp // -// if Cuda enabled : +// if GPU: // "KokkosSparse::NumericCMEM::KKSPEED::GPU" : gpu_team_policy_t, i.e. GPUTag // // else : @@ -519,7 +523,7 @@ void Kokkos::Impl::Timer numeric_speed_timer_with_free; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC (Kokkos::ViewAllocateWithoutInitializing("C keys"), valuesC_.extent(0)); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 9f4f7ec753..29dbb5c477 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -210,6 +210,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -785,6 +789,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1493,13 +1501,14 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } maxNumRoughNonzeros = KOKKOSKERNELS_MACRO_MIN(this->b_col_cnt, maxNumRoughNonzeros); - int shmem_size_to_use = shmem_size; + int shmem_size_to_use = shmem_size; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; @@ -1511,7 +1520,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1522,7 +1531,7 @@ void KokkosSPGEMM if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu){ //then chose the best method and parameters. 
current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1635,31 +1644,28 @@ void KokkosSPGEMM //initizalize value for the mem pool nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 <= num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; + } } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -1705,8 +1711,8 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - Kokkos::parallel_for("StructureC_NC::CUDA_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); + if (exec_gpu) { + Kokkos::parallel_for("StructureC_NC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { if (current_spgemm_algorithm == SPGEMM_KK_DENSE){ @@ -1791,8 +1797,9 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } @@ -1800,7 +1807,7 @@ void KokkosSPGEMM nnz_lno_t brows = row_mapB_.extent(0) - 1; size_type bnnz = entriesSetIndex.extent(0); size_type compressed_b_size = bnnz; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kk_reduce_diff_view (brows, old_row_mapB, row_mapB_, compressed_b_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1810,7 +1817,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, compressed_b_size); //this kernel does not really work well if the vector size is less than 4. 
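The suggested vector size consumed here comes from the heuristic this patch centralizes into kk_get_suggested_vector_size; the inline CUDA version deleted from the spgemm handle earlier rounded the average row length to a power of two. A sketch consistent with that deleted ladder, where the cap of 32 matches CUDA warps and would presumably widen to 64 for AMD wavefronts.

    // Average entries per row, rounded to a power of two and capped at the
    // warp width; host backends do not vectorize here and get 1.
    inline int suggested_vector_size(size_t nr, size_t nnz, bool on_gpu) {
      if (!on_gpu) return 1;
      int v = static_cast<int>(nnz / static_cast<double>(nr) + 0.5);
      if (v <= 3)  return 2;
      if (v <= 6)  return 4;
      if (v <= 12) return 8;
      if (v <= 24) return 16;
      return 32;
    }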
- if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1821,7 +1828,7 @@ void KokkosSPGEMM int shmem_size_to_use = shmem_size; if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { //then chose the best method and parameters. current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1951,7 +1958,7 @@ void KokkosSPGEMM } - if (current_spgemm_algorithm == SPGEMM_KK_DENSE && lcl_my_exec_space != KokkosKernels::Impl::Exec_CUDA){ + if (current_spgemm_algorithm == SPGEMM_KK_DENSE && !exec_gpu) { nnz_lno_t col_size = this->b_col_cnt / (sizeof (nnz_lno_t) * 8)+ 1; nnz_lno_t max_row_size = KOKKOSKERNELS_MACRO_MIN(col_size, maxNumRoughNonzeros); chunksize = col_size + max_row_size; @@ -1966,16 +1973,14 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1990,7 +1995,6 @@ void KokkosSPGEMM num_chunks = min_chunk_size; } } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -2035,7 +2039,7 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for("KokkosSparse::StructureC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -2584,6 +2588,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index d8997fcc12..27c0f4c7d9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -219,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1322,17 +1326,17 @@ void KokkosSPGEMM ){ bool apply_compression = this->handle->get_spgemm_handle()->get_compression(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; - int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -1414,29 +1418,27 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; + if(exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 < num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; } - num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << @@ -1486,8 +1488,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -1682,6 +1683,7 @@ void KokkosSPGEMM b_lno_row_view_t_, b_lno_nnz_view_t_, b_scalar_nnz_view_t_>:: KokkosSPGEMM_symbolic_triangle_setup(){ + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); @@ -1733,7 +1735,7 @@ void KokkosSPGEMM } size_type bnnz = set_index_entries.extent(0); - if (this->MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kkp_reduce_diff_view (this->b_row_cnt, p_rowmapB_begins, p_rowmapB_ends, bnnz); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index e59b95e8ac..ae913f864a 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -215,6 +215,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -892,12 +896,13 @@ void KokkosSPGEMM const int num_left_side_nnz_per_row = 2; const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -960,29 +965,24 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; - } - - -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 < num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; } - num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << @@ -1032,9 +1032,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index a32d6689b9..2e12457822 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -219,6 +219,10 @@ namespace KokkosSparse{ #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1181,6 +1185,8 @@ namespace KokkosSparse{ dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) { + using pool_memory_space = KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t>; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tSPARSE ACC MODE" << std::endl; } @@ -1238,7 +1244,7 @@ namespace KokkosSparse{ // Choose the SpGEMM algorithm and corresponding parameters if (this->spgemm_algorithm == SPGEMM_KK || this->spgemm_algorithm == SPGEMM_KK_LP){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1310,7 +1316,7 @@ namespace KokkosSparse{ } } } - // If CUDA is not enabled, we decide whether we want to use a sparse or a dense acumulator + // If non-GPU, we decide whether we want to use a sparse or a dense acumulator else { bool run_dense = false; @@ -1364,7 +1370,7 @@ namespace KokkosSparse{ // Compute the memory pool size - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; } @@ -1397,11 +1403,9 @@ namespace KokkosSparse{ } int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1414,7 +1418,6 @@ namespace KokkosSparse{ } num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\t max_nnz: " << max_nnz @@ -1428,11 +1431,10 @@ namespace KokkosSparse{ // Allocate the memory pool KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; Kokkos::Impl::Timer timer; pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); MyExecSpace().fence(); @@ -1470,7 +1472,7 @@ namespace KokkosSparse{ } timer.reset(); - 
if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_jacobi_sparseacc SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index b14f781320..3389577497 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -48,6 +48,7 @@ #include "KokkosKernels_Controls.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosBlas1_scal.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv_impl_omp.hpp" @@ -113,37 +114,30 @@ struct SPMV_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) + * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = threadWork + loop; if (iRow >= m_A.numRows ()) { return; } const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; - -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? ATV::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -234,11 +228,9 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && nnz > 5000000 ) { rows_per_thread = 256; @@ -247,14 +239,12 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 256/vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -469,12 +459,14 @@ struct SPMV_MV_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) + * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
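Both launch-parameter heuristics above now branch on kk_is_gpu_exec_space<execution_space>() instead of comparing the execution space against Kokkos::Cuda, so HIP picks up the GPU tuning automatically. A plausible shape for that trait, sketched under the assumption that CUDA and HIP are the only device backends of interest (the actual definition is in KokkosKernels_ExecSpaceUtils.hpp):

    #include <type_traits>
    #include <Kokkos_Core.hpp>

    namespace KokkosKernels { namespace Impl {
    // Sketch: compile-time "is this execution space a GPU backend?" query.
    template <typename ExecSpace>
    constexpr bool kk_is_gpu_exec_space() {
      return false
    #ifdef KOKKOS_ENABLE_CUDA
        || std::is_same<ExecSpace, Kokkos::Cuda>::value
    #endif
    #ifdef KOKKOS_ENABLE_HIP
        || std::is_same<ExecSpace, Kokkos::Experimental::HIP>::value
    #endif
        ;
    }
    }} // namespace KokkosKernels::Impl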
- const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = threadWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -482,15 +474,8 @@ struct SPMV_MV_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -514,8 +499,8 @@ struct SPMV_MV_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; @@ -527,7 +512,7 @@ template struct SPMV_MV_LayoutLeft_Functor { typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -542,21 +527,23 @@ struct SPMV_MV_LayoutLeft_Functor { //! The number of columns in the input and output MultiVectors. ordinal_type n; ordinal_type rows_per_thread; + int vector_length; SPMV_MV_LayoutLeft_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, const YVector& m_y_, - const ordinal_type rows_per_thread_) : + const ordinal_type rows_per_thread_, + int vector_length_) : alpha (alpha_), m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) + rows_per_thread (rows_per_thread_), vector_length(vector_length_) {} template KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& /* dev */, const ordinal_type& iRow, const ordinal_type& kk) const + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const { y_value_type sum[UNROLL]; @@ -586,133 +573,80 @@ struct SPMV_MV_LayoutLeft_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT #pragma loop count (15) #endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { sum[k] += val * m_x(ind, kk + k); } - } + }); if (doalpha == -1) { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane } } else { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
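That comment is the crux of why the shfl_down ladder could be deleted: each vector lane holds its own partial in sum[ii], and a parallel_reduce over a ThreadVectorRange of length vector_length hands every lane exactly one iteration in which to contribute its partial, after which Kokkos broadcasts the total back to all lanes. The same idea in isolation, as a hypothetical fragment inside a team functor (dev is the team member, vlen matches the TeamPolicy's vector length):

    double lane_partial = 1.0;  // stand-in for this lane's piece of the sum
    double total;
    // The loop index is deliberately unused: the range length only serves
    // to give each vector lane one iteration.
    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vlen),
      [&](int, double& lsum) { lsum += lane_partial; },
      total);
    // total now equals the sum over all lanes, identically on every lane,
    // so this behaves as an all-reduce (unlike the shfl_down ladder, which
    // leaves the complete sum only on lane 0).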
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) + sum[ii] = sumt; + else + sum[ii] = sumt * alpha; } } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } - - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } - } + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); } } KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& /* dev */, const ordinal_type& iRow) const + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - const auto row = m_A.rowConst (iRow); // The correct type of iEntry is ordinal_type, the type of the @@ -720,48 +654,17 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + lsum += val * m_x(row.colidx(iEntry),0); + }); + Kokkos::single(Kokkos::PerThread(dev), + [&]() { if (doalpha == -1) { sum = -sum; @@ -778,7 +681,7 @@ struct SPMV_MV_LayoutLeft_Functor { } else { m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; } - } + }); } @@ -800,99 +703,17 @@ struct SPMV_MV_LayoutLeft_Functor { // needs to have the same type as n. ordinal_type kk = 0; -#ifdef KOKKOS_FAST_COMPILE +//#ifdef KOKKOS_FAST_COMPILE for (; kk + 4 <= n; kk += 4) { strip_mine<4>(dev, iRow, kk); } for( ; kk < n; ++kk) { strip_mine<1>(dev, iRow, kk); } -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - } + //BMK: HERE } - }; + } +}; template OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -957,7 +778,7 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? 
For example, if the number of threads is 1, @@ -1115,7 +936,91 @@ spmv_alpha_mv (const char mode[], } } -} -} +}} //namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ + /* +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; +# endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } +#endif // KOKKOS_FAST_COMPILE + */ diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index a9c62806fd..3575f87dca 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -46,6 +46,7 @@ #define KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ #include "Kokkos_InnerProductSpaceTraits.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -91,12 +92,13 @@ struct SPMV_Struct_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } @@ -104,15 +106,8 @@ struct SPMV_Struct_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? 
ATV::conj (row.value(iEntry)) : @@ -120,8 +115,8 @@ struct SPMV_Struct_Transpose_Functor { const ordinal_type ind = row.colidx(iEntry); Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -302,7 +297,7 @@ struct SPMV_Struct_Functor { }); dev.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&] (const ordinal_type& loop) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team),[&] (const ordinal_type& loop) { const ordinal_type interiorIdx = static_cast ( dev.league_rank() ) * rows_per_team + loop; if(interiorIdx >= numInterior) { return; } @@ -665,11 +660,9 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && numInterior*nnz_per_row > 5000000 ) { rows_per_thread = 256; @@ -678,14 +671,12 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 128 / vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -903,27 +894,19 @@ struct SPMV_MV_Struct_Transpose_Functor { operator() (const team_member& dev) const { // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow represents a row of the matrix, so its correct type is - // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { + const ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } - const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -947,430 +930,251 @@ struct SPMV_MV_Struct_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; - template - struct SPMV_MV_Struct_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::non_const_value_type A_value_type; - typedef typename YVector::non_const_value_type y_value_type; - typedef typename Kokkos::TeamPolicy team_policy; - typedef typename team_policy::member_type team_member; - typedef typename YVector::non_const_value_type coefficient_type; - - const coefficient_type alpha; - AMatrix m_A; - XVector m_x; - const coefficient_type beta; - YVector m_y; - //! The number of columns in the input and output MultiVectors. 
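The transpose functors above all follow one scatter pattern: a TeamThreadRange over this thread's rows, a ThreadVectorRange over each row's entries, and an atomic add into y because different rows write the same output locations. Note that the early return inside the row lambda only skips that one row, matching the intent of the old bounds check. A condensed sketch of the pattern, with hypothetical view names A, x, y:

    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread),
      [&](ordinal_type loop) {
        const ordinal_type iRow = teamWorkStart + loop;
        if (iRow >= A.numRows()) return;  // acts like 'continue' here
        const auto row = A.rowConst(iRow);
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length),
          [&](ordinal_type iEntry) {
            Kokkos::atomic_add(&y(row.colidx(iEntry)),
                               alpha * row.value(iEntry) * x(iRow));
          });
      });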
- ordinal_type n; - ordinal_type rows_per_thread; - - SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, - const AMatrix& m_A_, - const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), - m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) - {} - - template - KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const - { - y_value_type sum[UNROLL]; - -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero (); - } +template +struct SPMV_MV_Struct_LayoutLeft_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_value_type A_value_type; + typedef typename YVector::non_const_value_type y_value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef typename YVector::non_const_value_type coefficient_type; - const auto row = m_A.rowConst (iRow); + const coefficient_type alpha; + AMatrix m_A; + XVector m_x; + const coefficient_type beta; + YVector m_y; + //! The number of columns in the input and output MultiVectors. + ordinal_type n; + ordinal_type rows_per_thread; + int vector_length; + + SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, + const AMatrix& m_A_, + const XVector& m_x_, + const coefficient_type& beta_, + const YVector& m_y_, + const ordinal_type rows_per_thread_, + int vector_length_) : + alpha (alpha_), + m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), + rows_per_thread (rows_per_thread_), vector_length(vector_length_) + {} - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. + template + KOKKOS_INLINE_FUNCTION void + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? - Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - const ordinal_type ind = row.colidx(iEntry); + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { + const A_value_type val = conjugate ? 
+ Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] += val * m_x(ind, kk + k); - } - } - - if (doalpha == -1) { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; - } + for (int k = 0; k < UNROLL; ++k) { + sum[k] += val * m_x(ind, kk + k); + } + }); + + if (doalpha == -1) { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type , y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane } - else { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + } + else { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) sum[ii] = sumt; - } + else + sum[ii] = sumt * alpha; } - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } - - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } - } - } } - KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + Kokkos::single(Kokkos::PerThread(dev), + [&]() { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - - const auto row = m_A.rowConst (iRow); + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) += sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); + } + }); + } - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? 
- Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha == -1) { - sum = -sum; - } else if (doalpha * doalpha != 1) { - sum *= alpha; - } + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + lsum += val * m_x(row.colidx(iEntry),0); + }, sum); + + Kokkos::single(Kokkos::PerThread(dev), + [&]() + { + if (doalpha == -1) { + sum = -sum; + } else if (doalpha * doalpha != 1) { + sum *= alpha; + } - if (dobeta == 0) { - m_y(iRow, 0) = sum ; - } else if (dobeta == 1) { - m_y(iRow, 0) += sum ; - } else if (dobeta == -1) { - m_y(iRow, 0) = -m_y(iRow, 0) + sum; - } else { - m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; - } - } - } + if (dobeta == 0) { + m_y(iRow, 0) = sum; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; + } + }); + } - KOKKOS_INLINE_FUNCTION void - operator() (const team_member& dev) const - { - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + KOKKOS_INLINE_FUNCTION void + operator() (const team_member& dev) const + { + for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow indexes over (local) rows of the matrix, so its correct - // type is ordinal_type. + // iRow indexes over (local) rows of the matrix, so its correct + // type is ordinal_type. - const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) - * rows_per_thread + loop; - if (iRow >= m_A.numRows ()) { - return; - } + const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) + * rows_per_thread + loop; + if (iRow >= m_A.numRows ()) { + return; + } - // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it - // needs to have the same type as n. - ordinal_type kk = 0; + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. 
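Kokkos::single(Kokkos::PerThread(dev), ...) is the portable replacement for the old "if (threadIdx.x == 0)" guard: it runs its body on exactly one vector lane of the calling thread on CUDA and HIP, and unconditionally on host backends where the lane count is one. A minimal sketch of the final-write step it protects (after the vector-range reduce, sum is already valid on every lane):

    // Only one lane may perform the read-modify-write of y(iRow, 0).
    Kokkos::single(Kokkos::PerThread(dev), [&]() {
      y(iRow, 0) = beta * y(iRow, 0) + sum;
    });

There is also Kokkos::PerTeam(dev) for once-per-team work, which is not needed here since each row belongs to a single thread.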
+ ordinal_type kk = 0; -#ifdef KOKKOS_FAST_COMPILE - for (; kk + 4 <= n; kk += 4) { - strip_mine<4>(dev, iRow, kk); - } - for( ; kk < n; ++kk) { - strip_mine<1>(dev, iRow, kk); - } -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } +//#ifdef KOKKOS_FAST_COMPILE + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(dev, iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(dev, iRow, kk); + } + //BMK: HERE + } + } +}; - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - } - } - }; - - - template - static void - spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; + template + static void + spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; - } - if (doalpha == 0) { - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); - } - return; + if (A.numRows () <= static_cast (0)) { + return; + } + if (doalpha == 0) { + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); } - else { - typedef typename AMatrix::size_type size_type; + return; + } + else { + typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. 
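The kk loop above strip-mines the multivector's n columns so that UNROLL is a compile-time constant, letting the kernel keep UNROLL running sums per row in registers. With the fast-compile dispatch, n = 11 resolves to strip_mine<4> at kk = 0 and 4, then strip_mine<1> at kk = 8, 9, 10. Sketch of the dispatch:

    ordinal_type kk = 0;
    for (; kk + 4 <= n; kk += 4)
      strip_mine<4>(dev, iRow, kk);  // register-blocked over 4 columns
    for (; kk < n; ++kk)
      strip_mine<1>(dev, iRow, kk);  // remainder columns one at a time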
+ const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1382,16 +1186,34 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. + const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1403,55 +1225,73 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. 
+ // team_size is a hardware resource thing so it might legitimately + // be int. + const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; + template + static void + spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; - } + if (A.numRows () <= static_cast (0)) { + return; + } - // We need to scale y first ("scaling" by zero just means filling - // with zeros), since the functor works by atomic-adding into y. - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); - } + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor works by atomic-adding into y. + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); + } - if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; + if (doalpha != 0) { + typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. + const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_Transpose_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. 
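The block above also shows the non-deprecated way to size teams: construct a TeamPolicy with Kokkos::AUTO and ask it for a recommendation against the actual functor. A sketch, assuming execution_space, op, vector_length, rows_per_thread, size_type and nrow as in the surrounding code:

    // The league size passed to the probe policy is a placeholder; the
    // recommendation depends on the functor and the vector length.
    Kokkos::TeamPolicy<execution_space> probe(rows_per_thread, Kokkos::AUTO,
                                              vector_length);
    const int team_size =
        probe.team_size_recommended(op, Kokkos::ParallelForTag());
    const int rows_per_team = rows_per_thread * team_size;
    const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team;
    Kokkos::parallel_for("KokkosSparse::spmv_struct",
        Kokkos::TeamPolicy<execution_space>(nteams, team_size, vector_length),
        op);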
@@ -1463,16 +1303,34 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. + const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_Transpose_Functor OpType; + typedef SPMV_MV_Struct_Transpose_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1484,73 +1342,176 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. 
+ const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else { - Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); - } + template + static void + spmv_alpha_beta_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + if (mode[0] == NoTranspose[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); } - - template - void - spmv_alpha_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; - - if (beta == KAT::zero ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == -KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } + else if (mode[0] == Conjugate[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); } + else if (mode[0] == Transpose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else if (mode[0] == ConjugateTranspose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else { + Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); + } + } + template + void + spmv_alpha_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename YVector::non_const_value_type coefficient_type; + typedef Kokkos::Details::ArithTraits KAT; + if (beta == KAT::zero ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else if (beta == KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } 
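This dispatch chain turns the runtime beta into the compile-time dobeta parameter of the kernels (0, 1, -1, or 2 for "general beta"), so the inner loops pay nothing to distinguish the cases; doalpha is handled the same way one level up. With the template arguments spelled out, the dispatch reads:

    if      (beta == KAT::zero()) spmv_alpha_beta_mv_struct<AMatrix, XVector, YVector, doalpha,  0>(mode, alpha, A, x, beta, y);
    else if (beta == KAT::one())  spmv_alpha_beta_mv_struct<AMatrix, XVector, YVector, doalpha,  1>(mode, alpha, A, x, beta, y);
    else if (beta == -KAT::one()) spmv_alpha_beta_mv_struct<AMatrix, XVector, YVector, doalpha, -1>(mode, alpha, A, x, beta, y);
    else                          spmv_alpha_beta_mv_struct<AMatrix, XVector, YVector, doalpha,  2>(mode, alpha, A, x, beta, y);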
+ else if (beta == -KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + } } } #endif // KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ + /* +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) + { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) + { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; + #endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE + */ diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index a9ffcd282a..271d8b2396 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2464,6 +2464,23 @@ struct ReturnRangePolicyType { } }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { + return PolicyType(stream,nt,ts); + } +}; +#endif template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > void lower_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index 295b46df9b..c069c618e6 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -72,6 +72,7 @@ struct Parameters{ int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; int a_mem_space, b_mem_space, c_mem_space, work_mem_space; @@ -121,6 +122,7 @@ struct Parameters{ use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; a_mem_space = b_mem_space = c_mem_space = work_mem_space = 1; a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = NULL; From 27e0a29071da9fd153e07a88c6a87244f127bb2f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 9 Oct 2020 12:56:39 -0600 Subject: [PATCH 02/18] Fixed spmv for OpenMP --- src/common/KokkosKernels_SparseUtils.hpp | 2 - .../impl/KokkosSparse_gauss_seidel_impl.hpp | 1 - .../impl/KokkosSparse_spgemm_impl_def.hpp | 1 - src/sparse/impl/KokkosSparse_spmv_impl.hpp | 182 +++++++++--------- 
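The sptrsv hunk above adds a per-backend policy factory for HIP, so supporting a new device is one more specialization rather than edits at every call site. With its template argument lists spelled out, the specialization looks like this (a sketch mirroring the CUDA version shown in the same file):

    #ifdef KOKKOS_ENABLE_HIP
    template <>
    struct ReturnRangePolicyType<Kokkos::Experimental::HIP> {
      using PolicyType = Kokkos::RangePolicy<Kokkos::Experimental::HIP>;

      static inline PolicyType get_policy(int nt, int ts) {
        return PolicyType(nt, ts);
      }

      // Overload taking an execution space instance (e.g. a HIP stream).
      template <class ExecInstanceType>
      static inline PolicyType get_policy(int nt, int ts,
                                          ExecInstanceType stream) {
        return PolicyType(stream, nt, ts);
      }
    };
    #endif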
.../impl/KokkosSparse_spmv_struct_impl.hpp | 177 +++++++++-------- 5 files changed, 178 insertions(+), 185 deletions(-) diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 7628e6de31..02ab3a50b7 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1341,8 +1341,6 @@ void kk_sort_graph( out_nnz_view_t out_adj, out_scalar_view_t out_vals){ - ExecSpaceType exec = kk_get_exec_space_type(); - // If possible, sort on host and avoid a deep copy // TODO BMK: can this function be deprecated? typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index d956ed8d4d..d5c111862f 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -826,7 +826,6 @@ namespace KokkosSparse{ nnz_lno_t num_values_in_l2 = 0; nnz_lno_t num_big_rows = 0; - KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space(); if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { //again, if it is on CPUs, we make L1 as big as we need. size_t l1mem = 1; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 4924e11b0c..8fdf276e61 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -121,7 +121,6 @@ void KokkosSPGEMM //number of rows and nnzs nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); - KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); //compress in single step if it is GPU. diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 3389577497..4645a08b63 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -366,7 +366,8 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; typedef SPMV_Transpose_Functor OpType; @@ -627,7 +628,7 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), [&](ordinal_type k) { - m_y(iRow, kk + k) = sum[k]; + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; }); } else if (dobeta == -1) { Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), @@ -662,7 +663,7 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); lsum += val * m_x(row.colidx(iEntry),0); - }); + }, sum); Kokkos::single(Kokkos::PerThread(dev), [&]() { @@ -703,14 +704,97 @@ struct SPMV_MV_LayoutLeft_Functor { // needs to have the same type as n. 
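The vector-length guard in this patch is the heart of the OpenMP repair: the lane all-reduce assumes the policy's vector length matches the reduction range, and on host backends the safe choice is a vector length of 1, with each thread owning whole rows. With the template arguments restored, the guarded heuristic reads (for the single-vector kernel, whose cap is 32 lanes):

    int vector_length = 1;
    if (KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>()) {
      // e.g. NNZPerRow = 40: 1 -> 2 -> 4 -> 8, then 8*2*3 = 48 > 40 stops
      // the doubling, so 8 lanes share each row on a GPU.
      while (static_cast<ordinal_type>(vector_length * 2 * 3) <= NNZPerRow &&
             vector_length < 32)
        vector_length *= 2;
    }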
ordinal_type kk = 0; -//#ifdef KOKKOS_FAST_COMPILE +#ifdef KOKKOS_FAST_COMPILE for (; kk + 4 <= n; kk += 4) { strip_mine<4>(dev, iRow, kk); } for( ; kk < n; ++kk) { strip_mine<1>(dev, iRow, kk); } - //BMK: HERE +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; +# endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE } } }; @@ -749,7 +833,8 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels @@ -939,88 +1024,3 @@ spmv_alpha_mv (const char mode[], }} //namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ - /* -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) { - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) { - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - */ diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp 
b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
index 3575f87dca..f4fa9ea1cd 100644
--- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
@@ -1118,14 +1118,99 @@ struct SPMV_MV_Struct_LayoutLeft_Functor {
     // needs to have the same type as n.
     ordinal_type kk = 0;
-//#ifdef KOKKOS_FAST_COMPILE
+#ifdef KOKKOS_FAST_COMPILE
     for (; kk + 4 <= n; kk += 4) {
       strip_mine<4>(dev, iRow, kk);
     }
     for( ; kk < n; ++kk) {
       strip_mine<1>(dev, iRow, kk);
     }
-    //BMK: HERE
+#else
+# ifdef __CUDA_ARCH__
+    if ((n > 8) && (n % 8 == 1)) {
+      strip_mine<9>(dev, iRow, kk);
+      kk += 9;
+    }
+    for(; kk + 8 <= n; kk += 8)
+      strip_mine<8>(dev, iRow, kk);
+    if(kk < n)
+    {
+      switch(n - kk) {
+# else // NOT a CUDA device
+    if ((n > 16) && (n % 16 == 1)) {
+      strip_mine<17>(dev, iRow, kk);
+      kk += 17;
+    }
+
+    for (; kk + 16 <= n; kk += 16) {
+      strip_mine<16>(dev, iRow, kk);
+    }
+
+    if(kk < n)
+    {
+      switch(n - kk) {
+      case 15:
+        strip_mine<15>(dev, iRow, kk);
+        break;
+
+      case 14:
+        strip_mine<14>(dev, iRow, kk);
+        break;
+
+      case 13:
+        strip_mine<13>(dev, iRow, kk);
+        break;
+
+      case 12:
+        strip_mine<12>(dev, iRow, kk);
+        break;
+
+      case 11:
+        strip_mine<11>(dev, iRow, kk);
+        break;
+
+      case 10:
+        strip_mine<10>(dev, iRow, kk);
+        break;
+
+      case 9:
+        strip_mine<9>(dev, iRow, kk);
+        break;
+
+      case 8:
+        strip_mine<8>(dev, iRow, kk);
+        break;
+ #endif // __CUDA_ARCH__
+      case 7:
+        strip_mine<7>(dev, iRow, kk);
+        break;
+
+      case 6:
+        strip_mine<6>(dev, iRow, kk);
+        break;
+
+      case 5:
+        strip_mine<5>(dev, iRow, kk);
+        break;
+
+      case 4:
+        strip_mine<4>(dev, iRow, kk);
+        break;
+
+      case 3:
+        strip_mine<3>(dev, iRow, kk);
+        break;
+
+      case 2:
+        strip_mine<2>(dev, iRow, kk);
+        break;
+
+      case 1:
+        strip_mine_1(dev, iRow);
+        break;
+      }
+    }
+#endif // KOKKOS_FAST_COMPILE
     }
   }
 };
@@ -1427,91 +1512,3 @@ struct SPMV_MV_Struct_LayoutLeft_Functor {
 }
 
 #endif // KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_
-  /*
-#else
-# ifdef __CUDA_ARCH__
-    if ((n > 8) && (n % 8 == 1)) {
-      strip_mine<9>(dev, iRow, kk);
-      kk += 9;
-    }
-    for(; kk + 8 <= n; kk += 8)
-      strip_mine<8>(dev, iRow, kk);
-    if(kk < n)
-    {
-      switch(n - kk) {
-# else // NOT a CUDA device
-    if ((n > 16) && (n % 16 == 1)) {
-      strip_mine<17>(dev, iRow, kk);
-      kk += 17;
-    }
-
-    for (; kk + 16 <= n; kk += 16) {
-      strip_mine<16>(dev, iRow, kk);
-    }
-
-    if(kk < n)
-    {
-      switch(n - kk) {
-      case 15:
-        strip_mine<15>(dev, iRow, kk);
-        break;
-
-      case 14:
-        strip_mine<14>(dev, iRow, kk);
-        break;
-
-      case 13:
-        strip_mine<13>(dev, iRow, kk);
-        break;
-
-      case 12:
-        strip_mine<12>(dev, iRow, kk);
-        break;
-
-      case 11:
-        strip_mine<11>(dev, iRow, kk);
-        break;
-
-      case 10:
-        strip_mine<10>(dev, iRow, kk);
-        break;
-
-      case 9:
-        strip_mine<9>(dev, iRow, kk);
-        break;
-
-      case 8:
-        strip_mine<8>(dev, iRow, kk);
-        break;
- #endif // __CUDA_ARCH__
-      case 7:
-        strip_mine<7>(dev, iRow, kk);
-        break;
-
-      case 6:
-        strip_mine<6>(dev, iRow, kk);
-        break;
-
-      case 5:
-        strip_mine<5>(dev, iRow, kk);
-        break;
-
-      case 4:
-        strip_mine<4>(dev, iRow, kk);
-        break;
-
-      case 3:
-        strip_mine<3>(dev, iRow, kk);
-        break;
-
-      case 2:
-        strip_mine<2>(dev, iRow, kk);
-        break;
-
-      case 1:
-        strip_mine_1(dev, iRow);
-        break;
-      }
-    }
-#endif // KOKKOS_FAST_COMPILE
-  */

From f993534289950c566205e25fded0236e80c379d6 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Fri, 9 Oct 2020 13:06:41 -0600
Subject: [PATCH 03/18] Removed #pragma unroll

Used to be a normal for loop, now it's a ThreadVectorRange
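As a schematic sketch of why the pragmas go away (illustration only, not part
of the diff; the accumulation body is elided):

  // Before: a plain per-thread loop, where ivdep/unroll/loop-count pragmas
  // could guide the host compiler.
  #pragma unroll
  for (ordinal_type iEntry = 0; iEntry < row.length; iEntry++) {
    /* accumulate into lsum */
  }

  // After: a vector-parallel loop; those pragmas no longer apply to it,
  // so they are dropped.
  Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length),
    [&](ordinal_type iEntry) {
      /* accumulate into lsum */
    });
---
 src/sparse/impl/KokkosSparse_spmv_impl.hpp | 9 ---------
 1 file changed, 9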
deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 4645a08b63..1d2f737fa6 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -565,15 +565,6 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry) { From 5e0b1191d97e43902b12205a71b9f49958c3ef32 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 12 Oct 2020 10:18:54 -0600 Subject: [PATCH 04/18] Update for deprecated removal --- .../impl/KokkosSparse_spmv_struct_impl.hpp | 73 ------------------- 1 file changed, 73 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index f4fa9ea1cd..be563c5257 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -1259,7 +1259,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { typename AMatrix::const_ordinal_type nrow = A.numRows(); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1271,23 +1270,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1298,7 +1280,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. 
@@ -1310,24 +1291,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI - #endif // KOKKOS_FAST_COMPILE } } @@ -1376,7 +1339,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { typename AMatrix::const_ordinal_type nrow = A.numRows(); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1388,23 +1350,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1415,7 +1360,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. 
@@ -1427,23 +1371,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE } From 5315d0f9b0a478f9e799f1195709134668918025 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 13 Oct 2020 18:18:25 -0700 Subject: [PATCH 05/18] Fix SpMV transpose functors --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 52 ++++++++++------------ 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 1d2f737fa6..86b342647b 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -81,7 +81,6 @@ struct GetCoeffView,DeviceType> { template struct SPMV_Transpose_Functor { typedef typename AMatrix::execution_space execution_space; @@ -96,32 +95,26 @@ struct SPMV_Transpose_Functor { const coefficient_type alpha; AMatrix m_A; XVector m_x; - const coefficient_type beta; YVector m_y; - const ordinal_type rows_per_thread; + ordinal_type rows_per_team; SPMV_Transpose_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), m_A (m_A_), m_x (m_x_), - beta (beta_), m_y (m_y_), - rows_per_thread (rows_per_thread_) + const YVector& m_y_) : + alpha (alpha_), m_A (m_A_), m_x (m_x_), m_y (m_y_) {} KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread; - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), [&](ordinal_type loop) { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = threadWork + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -366,18 +359,18 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, y); const int rows_per_thread = RowsPerThread (NNZPerRow); const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); @@ -444,30 +437,27 @@ struct SPMV_MV_Transpose_Functor { YVector m_y; const ordinal_type n; - const ordinal_type rows_per_thread; + ordinal_type rows_per_team; SPMV_MV_Transpose_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : + const YVector& m_y_) : alpha (alpha_), - m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) + m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)) {} KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread; - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), [&](ordinal_type loop) { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = threadWork + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -906,13 +896,15 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + //Transpose functor uses atomics which can't be vectorized on CPU + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels typedef SPMV_MV_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -924,6 +916,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); @@ -935,7 +928,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? 
For example, if the number of threads is 1,
@@ -945,6 +938,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph
   const int rows_per_thread = RowsPerThread(NNZPerRow);
   const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag());
   const int rows_per_team = rows_per_thread * team_size;
+  op.rows_per_team = rows_per_team;
   const size_type nteams = (nrow+rows_per_team-1)/rows_per_team;
   Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space >
      ( nteams , team_size , vector_length ) , op );

From da542886d32819e9b02c0e684a7f1b23c2ce0ce5 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Wed, 14 Oct 2020 10:42:52 -0700
Subject: [PATCH 06/18] Add back D1 default algorithm verbose output

---
 src/graph/KokkosGraph_Distance1ColorHandle.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp
index e85412abb6..503c6c9310 100644
--- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp
+++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp
@@ -237,9 +237,19 @@ class GraphColoringHandle
 {
     auto exec = KokkosKernels::Impl::kk_get_exec_space_type();
     if(exec == KokkosKernels::Impl::Exec_SERIAL)
+    {
       this->coloring_algorithm_type = COLORING_SERIAL;
+#ifdef VERBOSE
+      std::cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n";
+#endif
+    }
     else
+    {
       this->coloring_algorithm_type = COLORING_VBBIT;
+#ifdef VERBOSE
+      std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n";
+#endif
+    }
   }
 
   template

From b5349f110f27f96f2d4261fdeec8b6072c644ea1 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Wed, 14 Oct 2020 10:43:21 -0700
Subject: [PATCH 07/18] Fix HIP device code macros

It's __HIP_DEVICE_COMPILE__, not __CUDA_ARCH__.
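As a minimal illustration of the guard pattern this commit applies (a sketch,
not part of the diff; example_pop_count is a hypothetical name, and Kokkos
headers are assumed to be included):

  KOKKOS_FORCEINLINE_FUNCTION
  int example_pop_count(unsigned i) {
  #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
    // Device compilation pass: CUDA and HIP both provide the __popc intrinsic.
    return __popc(i);
  #else
    // Host compilation pass: portable fallback.
    int count = 0;
    for (; i; i >>= 1) count += (i & 1);
    return count;
  #endif
  }

__CUDA_ARCH__ is defined only during NVCC's device pass and
__HIP_DEVICE_COMPILE__ only during HIP's device pass, so testing either macro
selects the device code path on both backends.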
--- perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_LU_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp | 2 +- src/batched/KokkosBatched_Vector_SIMD.hpp | 4 ++-- src/batched/KokkosBatched_Vector_SIMD_Arith.hpp | 8 ++++---- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 8 ++++---- src/common/KokkosKernels_BitUtils.hpp | 5 ++--- src/common/KokkosKernels_SparseUtils.hpp | 1 - src/sparse/impl/KokkosSparse_spmv_impl.hpp | 6 +++--- src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 4 ++-- unit_test/batched/Test_Batched_SerialTrmm.hpp | 2 +- unit_test/batched/Test_Batched_SerialTrtri.hpp | 2 +- unit_test/blas/Test_Blas3_gemm.hpp | 2 +- unit_test/blas/Test_Blas3_trmm.hpp | 2 +- unit_test/blas/Test_Blas3_trsm.hpp | 3 ++- unit_test/blas/Test_Blas_trtri.hpp | 2 +- 19 files changed, 30 insertions(+), 31 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp index f37c2d1b6f..ac8abb18f7 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp @@ -51,7 +51,7 @@ using namespace KokkosBatched; int main (int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; const bool detail = false; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp index adff41c48b..2fffa06855 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp @@ -29,7 +29,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp index 7bb2a2907c..031909d540 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp index 8468800ee6..56ade7a446 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; const int N[1] = { 128*128 }; diff 
--git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp index 7b39c624f2..7d352283c6 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp @@ -21,7 +21,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int N = 128*128; for (int i=1;i #include -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #undef __KOKKOSBATCHED_ENABLE_AVX__ #else // compiler bug with AVX in some architectures @@ -129,7 +129,7 @@ namespace KokkosBatched { } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) namespace KokkosBatched { template<> diff --git a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp index 43ddbb101b..49317ca9d4 100644 --- a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp @@ -77,7 +77,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -298,7 +298,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -568,7 +568,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -858,7 +858,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) diff --git a/src/blas/impl/KokkosBlas3_gemm_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_impl.hpp index 2e50a0064c..fc5ba4dfa6 100644 --- a/src/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -64,20 +64,20 @@ namespace Impl { // On GPUs it is more important to not jump around in global memory, i.e. 
have coalesced loads
 template
 struct impl_gemm_choose_copy_layout {
-  typedef LayoutAScratch type;
+  using type = LayoutAScratch;
 };
 
 #ifdef KOKKOS_ENABLE_CUDA
 template
 struct impl_gemm_choose_copy_layout {
-  typedef LayoutA type;
+  using type = LayoutA;
 };
 #endif
 
 #ifdef KOKKOS_ENABLE_HIP
 template
 struct impl_gemm_choose_copy_layout {
-  typedef LayoutA type;
+  using type = LayoutA;
 };
 #endif
 
@@ -399,7 +399,7 @@ KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const ViewTypeC& C, const ViewTypeA& A, const ViewTypeB& B) {
   typedef typename ViewTypeC::non_const_value_type ScalarC;
   // GNU COMPILER BUG WORKAROUND
-#if defined(KOKKOS_COMPILER_GNU) || !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
   int blockA0 = A.extent_int(0);
   int blockA1 = A.extent_int(1);
   int blockB1 = B.extent_int(1);

diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp
index 28b2a01389..4d09fb964e 100644
--- a/src/common/KokkosKernels_BitUtils.hpp
+++ b/src/common/KokkosKernels_BitUtils.hpp
@@ -51,8 +51,7 @@ namespace KokkosKernels{
 namespace Impl{
 
 // POP COUNT function returns the number of set bits
-// Note BMK: HIP also defines __CUDA_ARCH__, and provides the same intrinsics.
-#if defined( __CUDA_ARCH__ )
+#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__)
 KOKKOS_FORCEINLINE_FUNCTION
 int pop_count( unsigned i ){
   return __popc(i);
@@ -182,7 +181,7 @@ int pop_count( long long i ){
 
 // least_set_bit function returns the position of right most set bit
-#if defined( __CUDA_ARCH__ )
+#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__)
 KOKKOS_FORCEINLINE_FUNCTION
 int least_set_bit( unsigned i ){
   return __ffs(i);

diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp
index 02ab3a50b7..6979f15847 100644
--- a/src/common/KokkosKernels_SparseUtils.hpp
+++ b/src/common/KokkosKernels_SparseUtils.hpp
@@ -1341,7 +1341,6 @@ void kk_sort_graph(
     out_nnz_view_t out_adj,
     out_scalar_view_t out_vals){
-
   // If possible, sort on host and avoid a deep copy
   // TODO BMK: can this function be deprecated?
typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); Kokkos::deep_copy (hr, in_xadj); diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 86b342647b..1c011e42d9 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -693,7 +693,7 @@ struct SPMV_MV_LayoutLeft_Functor { strip_mine<1>(dev, iRow, kk); } #else -# ifdef __CUDA_ARCH__ +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) if ((n > 8) && (n % 8 == 1)) { strip_mine<9>(dev, iRow, kk); kk += 9; @@ -702,7 +702,7 @@ struct SPMV_MV_LayoutLeft_Functor { strip_mine<8>(dev, iRow, kk); if(kk < n) { switch(n - kk) { -# else // NOT a CUDA device +# else // NOT a GPU if ((n > 16) && (n % 16 == 1)) { strip_mine<17>(dev, iRow, kk); kk += 17; @@ -745,7 +745,7 @@ struct SPMV_MV_LayoutLeft_Functor { case 8: strip_mine<8>(dev, iRow, kk); break; -# endif // __CUDA_ARCH__ +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ case 7: strip_mine<7>(dev, iRow, kk); break; diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index be563c5257..3179a0cc31 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -1126,7 +1126,7 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { strip_mine<1>(dev, iRow, kk); } #else -# ifdef __CUDA_ARCH__ +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) if ((n > 8) && (n % 8 == 1)) { strip_mine<9>(dev, iRow, kk); kk += 9; @@ -1180,7 +1180,7 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { case 8: strip_mine<8>(dev, iRow, kk); break; - #endif // __CUDA_ARCH__ + #endif // __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ case 7: strip_mine<7>(dev, iRow, kk); break; diff --git a/unit_test/batched/Test_Batched_SerialTrmm.hpp b/unit_test/batched/Test_Batched_SerialTrmm.hpp index 8f8fd48758..3301f3cd42 100644 --- a/unit_test/batched/Test_Batched_SerialTrmm.hpp +++ b/unit_test/batched/Test_Batched_SerialTrmm.hpp @@ -54,7 +54,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/batched/Test_Batched_SerialTrtri.hpp b/unit_test/batched/Test_Batched_SerialTrtri.hpp index c50e26ae35..f4f74d6b7c 100644 --- a/unit_test/batched/Test_Batched_SerialTrtri.hpp +++ b/unit_test/batched/Test_Batched_SerialTrtri.hpp @@ -56,7 +56,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp index 55c71231f6..451b7fedac 100644 --- a/unit_test/blas/Test_Blas3_gemm.hpp +++ b/unit_test/blas/Test_Blas3_gemm.hpp @@ -25,7 +25,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if 
defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     int i = team.league_rank();
 #else
     const int i = team.league_rank();

diff --git a/unit_test/blas/Test_Blas3_trmm.hpp b/unit_test/blas/Test_Blas3_trmm.hpp
index 74fd49b988..9f72bd5e63 100644
--- a/unit_test/blas/Test_Blas3_trmm.hpp
+++ b/unit_test/blas/Test_Blas3_trmm.hpp
@@ -49,7 +49,7 @@ namespace Test {
     KOKKOS_INLINE_FUNCTION
     void operator() (const typename Kokkos::TeamPolicy::member_type& team) const {
       // GNU COMPILER BUG WORKAROUND
-#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     int i = team.league_rank();
 #else
     const int i = team.league_rank();

diff --git a/unit_test/blas/Test_Blas3_trsm.hpp b/unit_test/blas/Test_Blas3_trsm.hpp
index e6e98723c2..8fec44b637 100644
--- a/unit_test/blas/Test_Blas3_trsm.hpp
+++ b/unit_test/blas/Test_Blas3_trsm.hpp
@@ -49,7 +49,8 @@ namespace Test {
     KOKKOS_INLINE_FUNCTION
     void operator() (const typename Kokkos::TeamPolicy::member_type& team) const {
       // GNU COMPILER BUG WORKAROUND
-#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+
     int i = team.league_rank();
 #else
     const int i = team.league_rank();

diff --git a/unit_test/blas/Test_Blas_trtri.hpp b/unit_test/blas/Test_Blas_trtri.hpp
index f939b87b31..bcc6b842c8 100644
--- a/unit_test/blas/Test_Blas_trtri.hpp
+++ b/unit_test/blas/Test_Blas_trtri.hpp
@@ -49,7 +49,7 @@ namespace Test {
     KOKKOS_INLINE_FUNCTION
     void operator() (const typename Kokkos::TeamPolicy::member_type& team) const {
       // GNU COMPILER BUG WORKAROUND
-#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     int i = team.league_rank();
 #else
     const int i = team.league_rank();

From 9a9ec3443b8be533f35d9baf4ca865f1ae9e7741 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Wed, 14 Oct 2020 10:52:53 -0700
Subject: [PATCH 08/18] Restore d2 coloring verbose about default algo

---
 src/graph/KokkosGraph_Distance2ColorHandle.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp
index 4c392051fb..39d66b744f 100644
--- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp
+++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp
@@ -206,9 +206,19 @@ class GraphColorDistance2Handle
     void choose_default_algorithm()
     {
         if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL)
+        {
             this->coloring_algorithm_type = COLORING_D2_SERIAL;
+#ifdef VERBOSE
+            std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n";
+#endif
+        }
         else
+        {
             this->coloring_algorithm_type = COLORING_D2_NB_BIT;
+#ifdef VERBOSE
+            std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n";
+#endif
+        }
     }

From 2c3e3a46750f21d7d1b268e47645da9089b2ca54 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Wed, 14 Oct 2020 11:01:38 -0700
Subject: [PATCH 09/18] Fix indent

---
 src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
index 29dbb5c477..ec0c2034a2 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
+++
b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1508,7 +1508,7 @@ void KokkosSPGEMM current_spgemm_algorithm = SPGEMM_KK_MEMORY; } maxNumRoughNonzeros = KOKKOSKERNELS_MACRO_MIN(this->b_col_cnt, maxNumRoughNonzeros); - int shmem_size_to_use = shmem_size; + int shmem_size_to_use = shmem_size; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; From d2448f2943052642a03a50c95588e1728283dd40 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 12:48:25 -0700 Subject: [PATCH 10/18] Factor out pool #chunks computation for SpGEMM (same code used in 7 places) --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 27 ++++++++++++ .../KokkosSparse_spgemm_impl_compression.hpp | 21 +-------- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 20 +-------- .../KokkosSparse_spgemm_impl_symbolic.hpp | 43 ++----------------- .../KokkosSparse_spgemm_impl_triangle.hpp | 21 ++------- ...se_spgemm_impl_triangle_no_compression.hpp | 19 ++------ ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 18 +------- 7 files changed, 43 insertions(+), 126 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index a8a539ef10..52ae067801 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -789,6 +789,33 @@ class KokkosSPGEMM{ }; +//Utility to compute the number of pool chunks for L2 hashmap accumulators. +//Uses free memory query for accelerators/GPUs but assumes infinite available host memory. +// +//chunk_bytes: bytes in each chunk +//ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) +template +size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) +{ + if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + size_t num_chunks = ideal_num_chunks; + //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + //then take the largest power of 2 smaller than that + nnz_lno_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; +} } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index c881c98ed4..6936a49f15 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -860,26 +860,9 @@ bool KokkosSPGEMM sszm_compressMatrix.pow2_hash_size = min_hash_size; sszm_compressMatrix.pow2_hash_func = min_hash_size - 1; - size_t num_chunks = concurrency / suggested_vector_size; + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << 
required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks*sizeof(int) > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - size_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\tPOOL chunksize:" << chunksize << " num_chunks:" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 38fce91b1b..e81b019e15 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -1421,25 +1421,9 @@ void chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index ec0c2034a2..4eb13d9b5e 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1642,30 +1642,13 @@ void KokkosSPGEMM } //initizalize value for the mem pool - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -1970,31 +1953,13 @@ void KokkosSPGEMM std::cout << "\tDense Acc - COLS:" << col_size << " max_row_size:" << max_row_size << std::endl; } } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; - KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index 27c0f4c7d9..6624343b52 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1416,29 +1416,14 @@ void KokkosSPGEMM } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - if(exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); + if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index ae913f864a..adc75d6eb2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -963,26 +963,13 @@ void KokkosSPGEMM pool_init_val = 0; } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); + if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 2e12457822..2140b8dc56 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -1401,23 +1401,9 @@ namespace KokkosSparse{ chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\t max_nnz: " << max_nnz From f4cacdc19ccf98c89f22e5d65246cf8782189433 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 13:10:09 -0700 Subject: [PATCH 11/18] Made compute_num_pool_chunks a member of SpGEMM --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 55 +++++++++---------- .../KokkosSparse_spgemm_impl_compression.hpp | 2 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 2 +- .../KokkosSparse_spgemm_impl_symbolic.hpp | 4 +- .../KokkosSparse_spgemm_impl_triangle.hpp | 2 +- ...se_spgemm_impl_triangle_no_compression.hpp | 2 +- 
...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 2 +- 7 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index 52ae067801..19e576eb9d 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -787,35 +787,34 @@ class KokkosSPGEMM{ typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType my_exec_space); -}; - -//Utility to compute the number of pool chunks for L2 hashmap accumulators. -//Uses free memory query for accelerators/GPUs but assumes infinite available host memory. -// -//chunk_bytes: bytes in each chunk -//ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) -template -size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) -{ - if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) - return ideal_num_chunks; - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = ideal_num_chunks * chunk_bytes; - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - size_t num_chunks = ideal_num_chunks; - //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down - if (required_size > free_byte / 2) { - num_chunks = (free_byte / 2) / chunk_bytes; + //Utility to compute the number of pool chunks for L2 hashmap accumulators. + //Uses free memory query for accelerators/GPUs but assumes infinite available host memory. + // + //chunk_bytes: bytes in each chunk + //ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) + template + size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) + { + if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + size_t num_chunks = ideal_num_chunks; + //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + //then take the largest power of 2 smaller than that + nnz_lno_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; } - //then take the largest power of 2 smaller than that - nnz_lno_t po2_num_chunks = 1; - while (po2_num_chunks * 2 < num_chunks) { - po2_num_chunks *= 2; - } - return po2_num_chunks; -} +}; } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 6936a49f15..35f00201a2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -860,7 +860,7 @@ bool KokkosSPGEMM sszm_compressMatrix.pow2_hash_size = min_hash_size; sszm_compressMatrix.pow2_hash_func = min_hash_size - 1; - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks 
(chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index e81b019e15..a5fc298e2c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -1422,7 +1422,7 @@ void chunksize += max_nnz; //this is for hash nexts } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 4eb13d9b5e..f6f4e8e3a8 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1647,7 +1647,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1958,7 +1958,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index 6624343b52..c06d4c4cb2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1421,7 +1421,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index adc75d6eb2..6a9b67c0b2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -967,7 +967,7 @@ void KokkosSPGEMM if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 2140b8dc56..d4c2c98a6f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -1402,7 +1402,7 @@ namespace KokkosSparse{ chunksize += max_nnz; //this is for hash nexts } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if 
(KOKKOSKERNELS_VERBOSE){ From 7aef9b13a1b38e19392074cbe18381802dfad115 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 13:13:13 -0700 Subject: [PATCH 12/18] Fix signed vs. unsigned --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index 19e576eb9d..06a3153ad9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -808,7 +808,7 @@ class KokkosSPGEMM{ num_chunks = (free_byte / 2) / chunk_bytes; } //then take the largest power of 2 smaller than that - nnz_lno_t po2_num_chunks = 1; + size_t po2_num_chunks = 1; while (po2_num_chunks * 2 < num_chunks) { po2_num_chunks *= 2; } From fd94bd47e3fa964a4054c510564fe1a3d83e2472 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 21:24:56 -0600 Subject: [PATCH 13/18] WIP: improving performance of spmv for openmp --- perf_test/sparse/CMakeLists.txt | 5 + perf_test/sparse/KokkosSparse_kk_spmv.cpp | 186 ++++++++ src/common/KokkosKernels_ExecSpaceUtils.hpp | 20 +- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 427 ++++++++++++++---- .../impl/KokkosSparse_spmv_impl_omp.hpp | 1 - unit_test/sparse/Test_Sparse_spmv.hpp | 31 ++ 6 files changed, 576 insertions(+), 94 deletions(-) create mode 100644 perf_test/sparse/KokkosSparse_kk_spmv.cpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index da22993cda..f0662e4a08 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -43,6 +43,11 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_spmv.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_kk_spmv + SOURCES KokkosSparse_kk_spmv.cpp + ) + IF(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) KOKKOSKERNELS_ADD_EXECUTABLE( sparse_spmv_merge diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp new file mode 100644 index 0000000000..07c29e3735 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -0,0 +1,186 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "KokkosKernels_default_types.hpp" + +typedef default_scalar Scalar; +typedef default_lno_t Ordinal; +typedef default_size_type Offset; + +template +void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, int num_vecs, char mode, Scalar beta) { + typedef KokkosSparse::CrsMatrix matrix_type; + typedef typename Kokkos::View mv_type; + typedef typename mv_type::HostMirror h_mv_type; + + srand(17312837); + matrix_type A; + if(filename) + A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + else + { + Offset nnz = 10 * numRows; + //note: the help text says the bandwidth is fixed at 0.01 * numRows + A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, 0, 0.01 * numRows); + } + numRows = A.numRows(); + numCols = A.numCols(); + Offset nnz = A.nnz(); + mv_type x("X", numCols, num_vecs); + mv_type y("Y", numRows, num_vecs); + h_mv_type h_x = Kokkos::create_mirror_view(x); + h_mv_type h_y = Kokkos::create_mirror_view(y); + h_mv_type h_y_compare = Kokkos::create_mirror(y); + + for(int v = 0; v < num_vecs; v++) + { + for(int i=0; i::value) + layout = 'L'; + else + layout = 'R'; + int loop = 100; + int num_vecs = 1; + Scalar beta = 0.0; + + if(argc == 1) { + print_help(); + return 0; + } + + for(int i=0;i(size,size,filename,loop,num_vecs,mode,beta); + else + run_spmv(size,size,filename,loop,num_vecs,mode,beta); + + Kokkos::finalize(); +} + diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index 22930c82e1..59bcf487fb 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -55,7 +55,7 @@ namespace Impl{ enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; template -constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ +KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -98,11 +98,23 @@ constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ template constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - auto exec = kk_get_exec_space_type(); - //TODO BMK: Add OpenMPTarget and any other future GPU exec spaces - return exec == Exec_CUDA || exec == Exec_HIP; + return false; } +#ifdef KOKKOS_ENABLE_CUDA +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + //Host function to determine free and total device memory. 
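The ExecSpaceUtils hunk above trades the runtime enum query for full template specializations, so kk_is_gpu_exec_space<ExecSpace>() folds to a compile-time constant that both host and device compilation can branch on. A self-contained sketch of that specialization pattern, using toy space types rather than the Kokkos ones:

// Primary template: an unrecognized execution space is assumed host.
struct ToyHostSpace {};
struct ToyGpuSpace {};

template <typename ExecSpace>
constexpr bool is_gpu_exec_space() { return false; }

// A full specialization flips the answer for each GPU space.
template <>
constexpr bool is_gpu_exec_space<ToyGpuSpace>() { return true; }

// The value is usable in constant expressions, so dispatch costs nothing:
static_assert(!is_gpu_exec_space<ToyHostSpace>(), "host space");
static_assert(is_gpu_exec_space<ToyGpuSpace>(), "gpu space");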
//Will throw if execution space doesn't support this. template diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 1c011e42d9..558acc363a 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -171,10 +171,38 @@ struct SPMV_Functor { "YVector must be a rank 1 View."); } + KOKKOS_INLINE_FUNCTION + void operator() (const ordinal_type iRow) const + { + using y_value_type = typename YVector::non_const_value_type; + if (iRow >= m_A.numRows ()) { + return; + } + const KokkosSparse::SparseRowViewConst row = m_A.rowConst(iRow); + const ordinal_type row_length = static_cast (row.length); + y_value_type sum = 0; + + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? + ATV::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry)); + } + + sum *= alpha; + + if (dobeta == 0) { + m_y(iRow) = sum ; + } else { + m_y(iRow) = beta * m_y(iRow) + sum; + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - typedef typename YVector::non_const_value_type y_value_type; + using y_value_type = typename YVector::non_const_value_type; Kokkos::parallel_for(Kokkos::TeamThreadRange(dev,0,rows_per_team), [&] (const ordinal_type& loop) { @@ -213,9 +241,19 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th if(nnz_per_row < 1) nnz_per_row = 1; + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + if(vector_length < 1) { vector_length = 1; - while(vector_length<32 && vector_length*6 < nnz_per_row) + while(vector_length < max_vector_length && vector_length * 6 < nnz_per_row) vector_length*=2; } @@ -280,21 +318,14 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, ((int) A.graph.row_block_offsets.extent(0) == (int) omp_get_max_threads()+1) && (((uintptr_t)(const void*)(x.data())%64)==0) && (((uintptr_t)(const void*)(y.data())%64)==0) ) { + //Note BMK: this case is typically not called in practice even for OpenMP, since + //it requires row_block_offsets to have been computed in the graph. spmv_raw_openmp_no_transpose(alpha,A,x,beta,y); return; } #endif - int team_size = -1; - int vector_length = -1; - int64_t rows_per_thread = -1; - - // Note on 03/24/20, lbv: We can use the controls - // here to allow the user to pass in some tunning - // parameters. 
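The launch-parameter hunk above caps the vector length at the hardware width, 32 lanes per CUDA warp and 64 per HIP wavefront, and keeps doubling only while each lane would still see about six entries of the row. The same heuristic in isolation, as an assumed standalone form rather than the actual spmv_launch_parameters signature:

// Pick a power-of-two vector length for rows of ~nnz_per_row entries.
int pick_vector_length(long long nnz_per_row, int hardware_width) {
  int vector_length = 1;
  while (vector_length < hardware_width && vector_length * 6 < nnz_per_row)
    vector_length *= 2;
  return vector_length;
}
// e.g. pick_vector_length(40, 32) == 8: eight lanes share the row, about
// five entries each; with hardware_width 64 the answer is unchanged,
// since the 6x threshold binds before the cap does.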
- if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} - if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} - if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule if(controls.isParameter("schedule")) { @@ -304,26 +335,45 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, use_static_schedule = true; } } - - int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); - int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; - - SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); - - if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); - else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); - } else { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + if(use_teams) { + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + + // Note on 03/24/20, lbv: We can use the controls + // here to allow the user to pass in some tuning + // parameters. + if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} + if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} + if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + + int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); + int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; + + SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); + + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } else { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } + } + else { + SPMV_Functor func (alpha,A,x,beta,y,1); + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); } } @@ -339,7 +389,8 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, typename YVector::const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename
AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -351,15 +402,23 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } - typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; typedef SPMV_Transpose_Functor OpType; @@ -367,9 +426,9 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, y); - const int rows_per_thread = RowsPerThread (NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -626,6 +685,65 @@ struct SPMV_MV_LayoutLeft_Functor { } } + template + KOKKOS_INLINE_FUNCTION void + strip_mine (const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. + + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? 
+ Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + if(doalpha == 1) + sum[k] += val * m_x(ind, kk + k); + else if(doalpha == -1) + sum[k] -= val * m_x(ind, kk + k); + else + sum[k] += alpha * val * m_x(ind, kk + k); + } + } + + if(doalpha == -1) + + if (dobeta == 0) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = sum[k]; + } else if (dobeta == 1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; + } else if (dobeta == -1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + } else { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + } + } + KOKKOS_INLINE_FUNCTION void strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const { @@ -666,6 +784,141 @@ struct SPMV_MV_LayoutLeft_Functor { }); } + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. + + y_value_type sum = y_value_type(); + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry),0); + } + if (doalpha == -1) { + sum = -sum; + } else if (doalpha != 1) { + sum *= alpha; + } + + if (dobeta == 0) { + m_y(iRow, 0) = sum ; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum ; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; + } + } + + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type& iRow) const + { + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. 
+ ordinal_type kk = 0; + +#ifdef KOKKOS_FAST_COMPILE + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(iRow, kk); + } +#else +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a GPU + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(iRow, kk); + break; + + case 14: + strip_mine<14>(iRow, kk); + break; + + case 13: + strip_mine<13>(iRow, kk); + break; + + case 12: + strip_mine<12>(iRow, kk); + break; + + case 11: + strip_mine<11>(iRow, kk); + break; + + case 10: + strip_mine<10>(iRow, kk); + break; + + case 9: + strip_mine<9>(iRow, kk); + break; + + case 8: + strip_mine<8>(iRow, kk); + break; +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(iRow, kk); + break; + + case 6: + strip_mine<6>(iRow, kk); + break; + + case 5: + strip_mine<5>(iRow, kk); + break; + + case 4: + strip_mine<4>(iRow, kk); + break; + + case 3: + strip_mine<3>(iRow, kk); + break; + + case 2: + strip_mine<2>(iRow, kk); + break; + + case 1: + strip_mine_1(iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const @@ -794,7 +1047,8 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a const typename YVector::non_const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; } @@ -806,16 +1060,18 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a return; } else { - typedef typename AMatrix::size_type size_type; // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); - int vector_length = 1; - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + ordinal_type vector_length = 1; + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length *= 2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels @@ -825,17 +1081,17 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a typename AMatrix::const_ordinal_type nrow = A.numRows(); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int.
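The flat operator() above maps the column count n onto compile-time unroll factors: peel one odd-sized chunk when the remainder would otherwise be a lone column, run the bulk in chunks of 8 (16 on CPU builds), and dispatch the tail through the switch. A reduced sketch of that dispatch, with a hypothetical do_chunk standing in for strip_mine:

template <int UNROLL>
void do_chunk(int kk) { /* process columns [kk, kk + UNROLL) */ }

void dispatch_columns(int n) {
  int kk = 0;
  // Peel 9 when n % 8 == 1, so the tail never degenerates to one column.
  if (n > 8 && n % 8 == 1) { do_chunk<9>(kk); kk += 9; }
  for (; kk + 8 <= n; kk += 8) do_chunk<8>(kk);
  switch (n - kk) {  // remainder is 0 or 2..7, or 1 only when n == 1
    case 7: do_chunk<7>(kk); break;
    case 6: do_chunk<6>(kk); break;
    case 5: do_chunk<5>(kk); break;
    case 4: do_chunk<4>(kk); break;
    case 3: do_chunk<3>(kk); break;
    case 2: do_chunk<2>(kk); break;
    case 1: do_chunk<1>(kk); break;  // the patch calls strip_mine_1 here
    default: break;
  }
}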
- const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space >( 0, nrow ), op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -846,18 +1102,18 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); - + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } } @@ -875,7 +1131,8 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const typename YVector::non_const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -895,10 +1152,12 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph // the appropriate type is ordinal_type. 
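Whichever policy runs, the launch must cover every row; the team path gets that from a ceiling division, so the last team picks up whatever is left after the full teams. A tiny illustration with a hypothetical helper:

// Number of teams needed for nrow rows at rows_per_team rows apiece.
long long compute_worksets(long long nrow, long long rows_per_team) {
  return (nrow + rows_per_team - 1) / rows_per_team;
}
// e.g. compute_worksets(1000, 64) == 16: fifteen full teams cover 960
// rows and the sixteenth handles the remaining 40.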
const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; + ordinal_type vector_length = 1; //Transpose functor uses atomics which can't be vectorized on CPU - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length*=2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels @@ -906,16 +1165,11 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph doalpha, dobeta, conjugate> OpType; OpType op (alpha, A, x, beta, y); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + const ordinal_type nrow = A.numRows(); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -930,14 +1184,9 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph OpType op (alpha, A, x, beta, y); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
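The transpose kernels above scatter rather than gather: each entry of row i adds alpha * A(i,j) * x(i) into y(j), so threads on different rows can collide on one y entry and must update it atomically, which is also why the comment notes the functor cannot vectorize on CPU. A sketch of one row's scatter over illustrative raw CSR arrays (the functors do the equivalent through rowConst):

#include <Kokkos_Core.hpp>

// Scatter one row's contribution to y = alpha * A^T * x.
KOKKOS_INLINE_FUNCTION
void scatter_row(const int* colidx, const double* values,
                 int row_begin, int row_end,
                 double alpha, double x_row, double* y) {
  for (int j = row_begin; j < row_end; ++j) {
    // Distinct rows may target the same y entry, hence the atomic.
    Kokkos::atomic_add(&y[colidx[j]], alpha * values[j] * x_row);
  }
}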
- const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > diff --git a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp index a4f1c07258..72c8a969fe 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp @@ -47,7 +47,6 @@ namespace Impl { #ifdef KOKKOS_ENABLE_OPENMP template void spmv_raw_openmp_no_transpose(typename YVector::const_value_type& s_a, AMatrix A, XVector x, typename YVector::const_value_type& s_b, YVector y) { - typedef typename YVector::non_const_value_type value_type; typedef typename AMatrix::ordinal_type ordinal_type; typedef typename AMatrix::non_const_size_type size_type; diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index e27012991a..598f906f8d 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -450,6 +450,36 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v } } +template +void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV){ + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + + typedef Kokkos::View ViewTypeX; + typedef Kokkos::View ViewTypeY; + + crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); + Kokkos::Random_XorShift64_Pool rand_pool(13718); + + for(int nv = 1; nv <= numMV; nv++) { + ViewTypeX b_x("A",numRows,nv); + ViewTypeY b_y("B",numRows,nv); + ViewTypeY b_y_copy("B",numRows,nv); + + Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); + + Kokkos::deep_copy(b_y_copy, b_y); + + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'T'); + } +} + template void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { @@ -816,6 +846,7 @@ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ + test_spmv_mv_heavy (200, 200 * 10, 60, 4, 50); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 792e48f6a6763b889d7c11d1a598218bdaf0ca4b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 21:46:41 -0600 Subject: [PATCH 14/18] Fixed typo in spmv --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 2 -- unit_test/sparse/Test_Sparse_spmv.hpp | 11 
++++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 558acc363a..55da2dea60 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -727,8 +727,6 @@ struct SPMV_MV_LayoutLeft_Functor { } } - if(doalpha == -1) - if (dobeta == 0) { for(ordinal_type k = 0; k < UNROLL; k++) m_y(iRow, kk + k) = sum[k]; diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 598f906f8d..5a033fdf34 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -42,7 +42,7 @@ struct fSPMV { if(error > eps) { err++; - printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); + //printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); } } }; @@ -203,8 +203,9 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto my_exec_space(0,y_i.extent(0)), fSPMV(y_i, y_spmv, eps), num_errors); - if(num_errors>0) printf("KokkosSparse::Test::spmv_mv: %i errors of %i for mv %i\n", - num_errors, y_i.extent_int(0), i); + if(num_errors>0) + std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors << " errors of " << y_i.extent_int(0) + << " for mv " << i << " (alpha=" << alpha << ", beta=" << beta << ", mode = " << mode << ")\n"; EXPECT_TRUE(num_errors==0); } } @@ -458,7 +459,7 @@ void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_ typedef Kokkos::View ViewTypeX; typedef Kokkos::View ViewTypeY; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); + crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numRows,nnz,row_size_variance, bandwidth); Kokkos::Random_XorShift64_Pool rand_pool(13718); for(int nv = 1; nv <= numMV; nv++) { @@ -846,7 +847,7 @@ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ - test_spmv_mv_heavy (200, 200 * 10, 60, 4, 50); \ + test_spmv_mv_heavy (200, 200 * 10, 60, 4, 30); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 55933aa0790b248c6171c080e238c87fb8108787 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 22:50:44 -0600 Subject: [PATCH 15/18] Use range policy for omp mode T spmv/spmv_mv --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 123 ++++++++++++++++----- 1 file changed, 95 insertions(+), 28 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 55da2dea60..e3934e9d5d 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -105,6 +105,21 @@ struct SPMV_Transpose_Functor { alpha (alpha_), m_A (m_A_), m_x (m_x_), m_y (m_y_) {} + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type iRow) const + { + const auto row = m_A.rowConst (iRow); + const ordinal_type row_length = row.length; + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? 
+ ATV::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { @@ -408,6 +423,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA if(std::is_same::value) @@ -417,8 +433,10 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, if(std::is_same::value) max_vector_length = 64; #endif - while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) - vector_length*=2; + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; + } typedef SPMV_Transpose_Functor OpType; @@ -426,14 +444,19 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, y); - const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); - + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0 , nrow ) , op ); + } } template::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + + if (doalpha != 1) { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (alpha * val * m_x(iRow, k))); + } + } else { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (val * m_x(iRow, k))); + } + } + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { @@ -1151,8 +1207,9 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); ordinal_type vector_length = 1; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); //Transpose functor uses atomics which can't be vectorized on CPU - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if(use_teams) { while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) vector_length*=2; } @@ -1164,14 +1221,19 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph OpType op (alpha, A, x, beta, y); const ordinal_type 
nrow = A.numRows(); - - const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::RangePolicy < typename AMatrix::execution_space > + ( 0 , nrow ) , op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1179,16 +1241,21 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph 2, 2, conjugate, SizeType> OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); + if(use_teams) { + OpType op (alpha, A, x, beta, y); - OpType op (alpha, A, x, beta, y); - - const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } From 80fc49c1e16236f6b33115d972bbe584ac602c85 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 08:58:56 -0600 Subject: [PATCH 16/18] Remove duplicate local typedef --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index e3934e9d5d..d42a0c81b2 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -1199,7 +1199,6 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph } if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; // Assuming that 
no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, From 1a35a8d33a6acceacae542700c1b1ad375bb0aa5 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 09:05:09 -0600 Subject: [PATCH 17/18] Remove unused var --- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 07c29e3735..aa8f2ddfa3 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -80,7 +80,6 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, } numRows = A.numRows(); numCols = A.numCols(); - Offset nnz = A.nnz(); mv_type x("X", numCols, num_vecs); mv_type y("Y", numRows, num_vecs); h_mv_type h_x = Kokkos::create_mirror_view(x); From d3909de4a28049a507e8983cbac6febecb188010 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 10:03:34 -0600 Subject: [PATCH 18/18] Fix execution_space typedef --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index d42a0c81b2..7b91f95e09 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -406,6 +406,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, { using ordinal_type = typename AMatrix::non_const_ordinal_type; using size_type = typename AMatrix::non_const_size_type; + using execution_space = typename AMatrix::execution_space; if (A.numRows () <= static_cast (0)) { return; @@ -423,7 +424,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; - bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA if(std::is_same::value) @@ -445,16 +446,16 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, y); if(use_teams) { - const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > ( nteams , team_size , vector_length ) , op ); } else { - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > ( 0 , nrow ) , op ); } }
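Taken together, the series leaves each spmv entry point with one functor and two launch shapes chosen by the execution space: hierarchical TeamPolicy parallelism where vector lanes pay off, and a flat RangePolicy over rows where they do not. A condensed sketch of that final pattern, with an illustrative functor rather than the actual KokkosSparse internals:

#include <Kokkos_Core.hpp>

// One functor, two entry points, echoing SPMV_Functor after this series:
// the team operator covers a block of rows cooperatively, the flat
// operator covers exactly one row.
struct SpmvSketch {
  KOKKOS_INLINE_FUNCTION
  void operator()(const int iRow) const { /* process row iRow */ }
  KOKKOS_INLINE_FUNCTION
  void operator()(const Kokkos::TeamPolicy<>::member_type& dev) const {
    /* process this team's block of rows */
  }
};

void launch_spmv(bool use_teams, int nrow, int worksets,
                 int team_size, int vector_length) {
  SpmvSketch op;
  if (use_teams)
    Kokkos::parallel_for("spmv_sketch",
        Kokkos::TeamPolicy<>(worksets, team_size, vector_length), op);
  else
    Kokkos::parallel_for("spmv_sketch",
        Kokkos::RangePolicy<>(0, nrow), op);
}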