From e8112f7d14fe547095bad2b28184fc130345fa55 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 6 Oct 2020 11:49:39 -0600 Subject: [PATCH 01/18] WIP: adding HIP codepaths in preparation for tests/ETI --- perf_test/graph/KokkosGraph_color.cpp | 9 + perf_test/graph/KokkosGraph_color_d2.cpp | 23 +- perf_test/graph/KokkosGraph_mis_d2.cpp | 18 +- perf_test/graph/KokkosGraph_triangle.cpp | 19 +- perf_test/sparse/KokkosSparse_pcg.cpp | 291 ++--- perf_test/sparse/KokkosSparse_spadd.cpp | 2 +- perf_test/sparse/KokkosSparse_spgemm.cpp | 17 +- src/Kokkos_ArithTraits.hpp | 11 +- .../KokkosBatched_Gemm_Team_Internal.hpp | 20 +- .../KokkosBatched_Trsm_Team_Internal.hpp | 20 +- .../KokkosBatched_Trsv_Serial_Internal.hpp | 2 +- .../KokkosBatched_Trsv_Team_Internal.hpp | 2 +- src/batched/KokkosBatched_Util.hpp | 2 +- src/batched/KokkosBatched_Vector.hpp | 38 + src/batched/KokkosBatched_Vector_SIMD.hpp | 10 +- .../KokkosBatched_Vector_SIMD_Arith.hpp | 8 +- src/blas/impl/KokkosBlas2_gemv_impl.hpp | 4 +- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 7 + src/blas/impl/KokkosBlas3_gemm_spec.hpp | 4 + src/common/KokkosKernels_BitUtils.hpp | 1 + src/common/KokkosKernels_ExecSpaceUtils.hpp | 86 +- src/common/KokkosKernels_Handle.hpp | 2 +- src/common/KokkosKernels_Macros.hpp | 4 +- src/common/KokkosKernels_SparseUtils.hpp | 168 +-- ...Kernels_Uniform_Initialized_MemoryPool.hpp | 3 +- src/common/KokkosKernels_Utils.hpp | 67 +- src/common/KokkosKernels_default_types.hpp | 2 + .../KokkosGraph_Distance1ColorHandle.hpp | 70 +- .../KokkosGraph_Distance2ColorHandle.hpp | 62 +- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 2 +- src/sparse/KokkosSparse_CrsMatrix.hpp | 6 + .../KokkosSparse_gauss_seidel_handle.hpp | 74 +- src/sparse/KokkosSparse_spadd.hpp | 61 - src/sparse/KokkosSparse_spgemm_handle.hpp | 76 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 47 +- .../impl/KokkosSparse_partitioning_impl.hpp | 529 --------- .../KokkosSparse_spgemm_impl_compression.hpp | 53 +- .../impl/KokkosSparse_spgemm_impl_def.hpp | 5 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 26 +- .../impl/KokkosSparse_spgemm_impl_speed.hpp | 8 +- .../KokkosSparse_spgemm_impl_symbolic.hpp | 86 +- .../KokkosSparse_spgemm_impl_triangle.hpp | 48 +- ...se_spgemm_impl_triangle_no_compression.hpp | 46 +- ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 26 +- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 451 +++----- .../impl/KokkosSparse_spmv_struct_impl.hpp | 1005 ++++++++--------- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 17 + test_common/KokkosKernels_TestParameters.hpp | 2 + 48 files changed, 1312 insertions(+), 2228 deletions(-) diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cbc3697517..f7d8a93e80 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -579,6 +579,15 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_experiment + ( + params + ); + } +#endif + #if defined( KOKKOS_ENABLE_SERIAL ) if (params.use_serial) { #ifdef KOKKOSKERNELS_MULTI_MEM diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index 970bafa380..04d977527d 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -81,6 +81,7 @@ struct D2Parameters int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; const char* mtx_file; ColoringMode d2_color_type; @@ -93,6 +94,7 @@ struct 
D2Parameters use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; mtx_file = NULL; d2_color_type = MODE_D2_SYMMETRIC; @@ -147,6 +149,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #ifdef KOKKOS_ENABLE_CUDA << spaces << " --cuda Use given CUDA device" << std::endl +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use given HIP device" << std::endl #endif << std::endl << spaces << " Coloring modes:" << std::endl @@ -199,6 +204,10 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) { params.use_cuda = 1 + atoi(getNextArg(i, argc, argv)); } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1 + atoi(getNextArg(i, argc, argv)); + } else if(0 == strcasecmp(argv[i], "--repeat")) { params.repeat = atoi(getNextArg(i, argc, argv)); @@ -273,7 +282,7 @@ int parse_inputs(D2Parameters ¶ms, int argc, char **argv) print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -603,6 +612,8 @@ int main(int argc, char *argv[]) int device_id = 0; if(params.use_cuda) device_id = params.use_cuda - 1; + else if(params.use_hip) + device_id = params.use_hip - 1; Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); // Print out verbose information about the configuration of the run. @@ -645,6 +656,16 @@ int main(int argc, char *argv[]) } #endif + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + if(!use_multi_mem) + { + KokkosKernels::Experiment::experiment_driver(params); + } + } + #endif + #if defined(KOKKOS_ENABLE_SERIAL) if(params.use_serial) { diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index da9fb549d6..32ff5f5fbd 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -75,6 +75,7 @@ struct MIS2Parameters int use_threads = 0; int use_openmp = 0; int use_cuda = 0; + int use_hip = 0; int use_serial = 0; const char* mtx_file = NULL; MIS2_Algorithm algo = MIS2_FAST; @@ -163,6 +164,9 @@ void print_options(std::ostream &os, const char *app_name, unsigned int indent = #endif #ifdef KOKKOS_ENABLE_CUDA << spaces << " --cuda Use CUDA.\n" +#endif +#ifdef KOKKOS_ENABLE_HIP + << spaces << " --hip Use HIP.\n" #endif << std::endl << spaces << " Optional Parameters:" << std::endl @@ -205,6 +209,10 @@ int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) { params.use_cuda = 1; } + else if(0 == strcasecmp(argv[i], "--hip")) + { + params.use_hip = 1; + } else if(0 == strcasecmp(argv[i], "--repeat")) { params.repeat = atoi(getNextArg(i, argc, argv)); @@ -252,7 +260,7 @@ int parse_inputs(MIS2Parameters ¶ms, int argc, char **argv) print_options(std::cout, argv[0]); return 1; } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) + if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda && !params.use_hip) { print_options(std::cout, argv[0]); return 1; @@ -362,6 +370,14 @@ int main(int argc, char *argv[]) } #endif + #if defined(KOKKOS_ENABLE_HIP) + if(params.use_hip) + { + run_mis2(params); + run = true; + } + #endif + #if defined(KOKKOS_ENABLE_SERIAL) if(params.use_serial) { diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 6f0b6c73df..63a52dbaea 100644 --- 
a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -54,7 +54,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda" << std::endl; + std::cerr << "Choose BackEnd : --openmp [numthreads] | --cuda | --hip" << std::endl; std::cerr << "Input Matrix : --amtx [path_to_input_matrix]" << std::endl; std::cerr << "\tInput Matrix format can be multiple formats. If it ends with:" << std::endl; std::cerr << "\t\t.mtx: it will read matrix market format." << std::endl; @@ -96,6 +96,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi( argv[++i] ); } @@ -292,7 +295,6 @@ int main (int argc, char ** argv){ const int device_id = 0; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); -#if !defined (KOKKOS_ENABLE_CUDA) #if defined( KOKKOS_ENABLE_OPENMP ) if (params.use_openmp) { @@ -311,10 +313,9 @@ int main (int argc, char ** argv){ } #endif -#endif -#if defined( KOKKOS_ENABLE_CUDA1 ) +#if defined( KOKKOS_ENABLE_CUDA ) if (params.use_cuda) { Kokkos::Cuda::print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM @@ -332,6 +333,16 @@ int main (int argc, char ** argv){ #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + Kokkos::Experimental::HIP::print_configuration(std::cout); + KokkosKernels::Experiment::run_multi_mem_triangle + ( + params + ); + } +#endif + Kokkos::finalize(); return 0; diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 681327dfaf..0f6351189b 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -43,32 +43,24 @@ */ #include -#if defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) #include "KokkosSparse_pcg.hpp" #include "KokkosKernels_Utils.hpp" -#include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_default_types.hpp" +#include #define MAXVAL 1 -#define SIZE_TYPE size_t -#define INDEX_TYPE int -#define SCALAR_TYPE double - - - template -scalar_view_t create_x_vector(INDEX_TYPE nv, SCALAR_TYPE max_value = 1.0){ +scalar_view_t create_x_vector(default_lno_t nv, default_scalar max_value = 1.0){ scalar_view_t kok_x ("X", nv); typename scalar_view_t::HostMirror h_x = Kokkos::create_mirror_view (kok_x); - for (INDEX_TYPE i = 0; i < nv; ++i){ - SCALAR_TYPE r = static_cast (rand()) / static_cast (RAND_MAX / max_value); + for (default_lno_t i = 0; i < nv; ++i){ + default_scalar r = static_cast (rand()) / static_cast (RAND_MAX / max_value); h_x(i) = r; } Kokkos::deep_copy (kok_x, h_x); @@ -98,7 +90,7 @@ void run_experiment( typedef typename lno_view_t::value_type size_type; typedef typename scalar_view_t::value_type scalar_t; - INDEX_TYPE nv = crsmat.numRows(); + default_lno_t nv = crsmat.numRows(); scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); @@ -255,25 +247,70 @@ void run_experiment( */ } - - - enum { CMD_USE_THREADS = 0 , CMD_USE_NUMA , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA + , CMD_USE_HIP , CMD_USE_OPENMP - , CMD_USE_CUDA_DEV + , CMD_DEVICE , CMD_BIN_MTX , CMD_CLUSTER_SIZE , CMD_USE_SEQUENTIAL_SGS , CMD_ERROR , CMD_COUNT }; 
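+// The per-backend main() bodies below collapse into one templated driver, and
+// every enabled backend then dispatches through the same three-line idiom. A
+// minimal preview of that idiom for the HIP case (run_pcg is defined next, and
+// the real dispatch appears at the bottom of this file):
+//
+//   #if defined(KOKKOS_ENABLE_HIP)
+//     if (cmdline[CMD_USE_HIP])
+//       run_pcg<Kokkos::Experimental::HIP>(cmdline, mtx_file);
+//   #endif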
+template +void run_pcg(int* cmdline, const char* mtx_file) +{ + default_lno_t nv = 0, ne = 0; + default_lno_t *xadj, *adj; + default_scalar *ew; + + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_file); + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsMat_t::index_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + row_map_view_t rowmap_view("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view("values_view", ne); + + { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + + for (default_lno_t i = 0; i <= nv; ++i){ + hr(i) = xadj[i]; + } + + for (default_lno_t i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + Kokkos::deep_copy (rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); + } + graph_t static_graph (columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); + + delete [] xadj; + delete [] adj; + delete [] ew; + + run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); +} + int main (int argc, char ** argv){ int cmdline[ CMD_COUNT ] ; - char *mtx_bin_file = NULL; + char *mtx_file = NULL; for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ; for ( int i = 1 ; i < argc ; ++i ) { @@ -283,17 +320,22 @@ int main (int argc, char ** argv){ else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] ); } + /* else if ( 0 == strcasecmp( argv[i] , "--cores" ) ) { + //Note BMK: specifying #NUMA regions isn't supported by initialize sscanf( argv[++i] , "%dx%d" , cmdline + CMD_USE_NUMA , cmdline + CMD_USE_CORE_PER_NUMA ); } + */ else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { cmdline[ CMD_USE_CUDA ] = 1 ; } - else if ( 0 == strcasecmp( argv[i] , "--cuda-dev" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + cmdline[ CMD_USE_HIP ] = 1 ; + } + else if ( 0 == strcasecmp( argv[i] , "--device-id" ) ) { + cmdline[ CMD_DEVICE ] = atoi( argv[++i] ) ; } else if ( 0 == strcasecmp( argv[i] , "--cluster-size" ) ) { cmdline[CMD_CLUSTER_SIZE] = atoi(argv[++i]); @@ -303,12 +345,12 @@ int main (int argc, char ** argv){ } else if ( 0 == strcasecmp( argv[i] , "--mtx" ) ) { - mtx_bin_file = argv[++i]; + mtx_file = argv[++i]; } else { cmdline[ CMD_ERROR ] = 1 ; std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; return 0; } @@ -317,190 +359,43 @@ int main (int argc, char ** argv){ if(cmdline[CMD_CLUSTER_SIZE] == 0) cmdline[CMD_CLUSTER_SIZE] = 1; - if (mtx_bin_file == NULL){ - std::cerr << "Provide a mtx binary file" << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp 
[numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; + if (mtx_file == NULL){ + std::cerr << "Provide a matrix file" << std::endl ; + std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--hip\n\t--device-id[DeviceIndex]\n\t--mtx[matrix]" << std::endl; return 0; } + Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space -#if defined( KOKKOS_ENABLE_THREADS ) - - if ( cmdline[ CMD_USE_THREADS ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; // How to get this to initialize() without using impl_initialize()? - } - else { - init_args.num_threads = cmdline[ CMD_USE_THREADS ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } + init_args.device_id = cmdline[ CMD_DEVICE ]; + if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { + init_args.num_threads = std::max(cmdline[ CMD_USE_THREADS ], cmdline [ CMD_USE_OPENMP ]); + init_args.num_numa = cmdline[ CMD_USE_NUMA ]; + } + else { + init_args.num_threads = cmdline[ CMD_USE_THREADS ]; + } - Kokkos::finalize(); - } + Kokkos::initialize( init_args ); + { +#if defined( KOKKOS_ENABLE_THREADS ) + if(cmdline[CMD_USE_THREADS]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_OPENMP ) - - if ( cmdline[ CMD_USE_OPENMP ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - init_args.num_numa = cmdline[ CMD_USE_NUMA ]; - //const int core_per_numa = cmdline[ CMD_USE_CORE_PER_NUMA ]; - } - else { - init_args.num_threads = cmdline[ CMD_USE_OPENMP ]; - } - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename 
crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_OPENMP]) + run_pcg(cmdline, mtx_file); #endif - #if defined( KOKKOS_ENABLE_CUDA ) - if ( cmdline[ CMD_USE_CUDA ] ) { - - Kokkos::InitArguments init_args; // Construct with default args, change members based on exec space - - // Use the last device: - init_args.device_id = cmdline[ CMD_USE_CUDA_DEV ]; - - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); - { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; - } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); - - - } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - // typedef typename KokkosSparse::CrsMatrix crsMat_t; - // crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - - run_experiment(crsmat, cmdline[CMD_CLUSTER_SIZE], cmdline[CMD_USE_SEQUENTIAL_SGS]); - } - Kokkos::finalize(); - } + if(cmdline[CMD_USE_CUDA]) + run_pcg(cmdline, mtx_file); #endif - +#if defined( KOKKOS_ENABLE_HIP ) + if(cmdline[CMD_USE_HIP]) + run_pcg(cmdline, mtx_file); +#endif + } + Kokkos::finalize(); return 0; } -#else -int main() { -} -#endif diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index f90c6179f7..959e9d973c 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -60,7 +60,7 @@ void print_options(){ std::cerr << "Options\n" << std::endl; - std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | 
'--openmp [numThreads]' | '--cuda [cudaDeviceIndex]'" << std::endl; + std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" << std::endl; std::cerr << "\t[Required] --amtx :: 1st input matrix" << std::endl; std::cerr << "\t[Required] --bmtx :: 2nd input matrix" << std::endl; diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index 80e4ab7c34..0f1c9f6210 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -52,7 +52,7 @@ void print_options(){ std::cerr << "\t[Required] INPUT MATRIX: '--amtx [left_hand_side.mtx]' -- for C=AxA" << std::endl; - std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; + std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' --> if none are specified, Serial is used (if enabled)" << std::endl; std::cerr << "\t[Optional] '--algorithm [DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE|MKL|CUSPARSE|CUSP|VIENNA|MKL2]' --> to choose algorithm. KKMEM is outdated, use KKSPGEMM instead." << std::endl; std::cerr << "\t[Optional] --bmtx [righ_hand_side.mtx]' for C = AxB" << std::endl; std::cerr << "\t[Optional] OUTPUT MATRICES: '--cmtx [output_matrix.mtx]' --> to write output C=AxB" << std::endl; @@ -84,6 +84,9 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = atoi(getNextArg(i, argc, argv)) + 1; } + else if ( 0 == strcasecmp( argv[i] , "--hip" ) ) { + params.use_hip = atoi(getNextArg(i, argc, argv)) + 1; + } else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { params.repeat = atoi(getNextArg(i, argc, argv)); } @@ -297,7 +300,7 @@ int main (int argc, char ** argv){ } const int num_threads = std::max(params.use_openmp, params.use_threads); - const int device_id = params.use_cuda - 1; + const int device_id = params.use_cuda ? params.use_cuda - 1 : params.use_hip - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); Kokkos::print_configuration(std::cout); @@ -336,6 +339,16 @@ int main (int argc, char ** argv){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (params.use_hip) { + KokkosKernels::Experiment::run_multi_mem_spgemm + ( + params + ); + + } +#endif + #if defined( KOKKOS_ENABLE_THREADS ) //If only serial is enabled (or no other device was specified), run with serial if (params.use_threads) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 3a6ea1cca5..6e4af2c7b3 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -50,6 +50,7 @@ #include #include +#include #ifdef HAVE_KOKKOSKERNELS_QUADMATH # include @@ -63,16 +64,6 @@ #ifdef __CUDACC__ # include #endif -// -// mfh 24 Dec 2013: Temporary measure for testing; will go away. 
-// -#ifndef KOKKOS_FORCEINLINE_FUNCTION -# ifdef __CUDA_ARCH__ -# define KOKKOS_FORCEINLINE_FUNCTION inline __host__ __device__ -# else -# define KOKKOS_FORCEINLINE_FUNCTION -# endif // __CUDA_ARCH__ -#endif // KOKKOS_FORCEINLINE_FUNCTION namespace { // anonymous diff --git a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp index f4f682cb91..c7e7613769 100644 --- a/src/batched/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Gemm_Team_Internal.hpp @@ -5,6 +5,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Scale_Internal.hpp" @@ -111,7 +112,7 @@ namespace KokkosBatched { member.team_barrier(); /// - /// case cuda: team size is large and blocksize (mb,nb) is small + /// GPU case: team size is large and blocksize (mb,nb) is small InnerGemmFixC inner(as0, as1, bs0, bs1, cs0, cs1); auto gemm = [&](const int ib, const int jb, @@ -128,13 +129,16 @@ namespace KokkosBatched { Kokkos::parallel_for (Kokkos::TeamThreadRange(member, mq*nq ), [&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%mq*mb, j = ij/mq*nb; -#else - const int i = ij/nq*mb, j = ij%nq*nb; -#endif + int i, j; + //note: the condition is constexpr + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%mq*mb; + j = ij/mq*nb; + } + else { + i = ij/nq*mb; + j = ij%nq*nb; + } inner.serial_invoke(alpha, AA+i*as0, BB+j*bs1, (i+mb) > ib ? mp : mb, diff --git a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp index 085bd9e293..64d8368f16 100644 --- a/src/batched/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsm_Team_Internal.hpp @@ -5,6 +5,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Scale_Internal.hpp" @@ -114,7 +115,7 @@ namespace KokkosBatched { /// case host: team size is small and blocksize (mb,nb) is large /// - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, bs1); @@ -195,7 +196,6 @@ namespace KokkosBatched { const ScalarType alpha, const ValueType *__restrict__ A, const int as0, const int as1, /**/ ValueType *__restrict__ B, const int bs0, const int bs1) { - const ScalarType one(1.0), zero(0.0); // note that parallel range is different ( m*n vs m-1*n); @@ -223,13 +223,15 @@ namespace KokkosBatched { } Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,iend*jend),[&](const int &ij) { -#if \ - defined (KOKKOS_ENABLE_CUDA) && \ - defined (KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) - const int i = ij%iend, j = ij/iend; -#else - const int i = ij/jend, j = ij%jend; -#endif + int i, j; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + i = ij%iend; + j = ij/iend; + } + else { + i = ij/jend; + j = ij%jend; + } B0[i*bs0+j*bs1] -= a01[i*as0] * b1t[j*bs1]; }); } diff --git a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp index 618f8dc614..5bf26f0865 100644 --- a/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Serial_Internal.hpp @@ 
-99,7 +99,7 @@ namespace KokkosBatched { if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp index 20ee624006..7d72f01e15 100644 --- a/src/batched/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/KokkosBatched_Trsv_Team_Internal.hpp @@ -115,7 +115,7 @@ namespace KokkosBatched { if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; - /// case cuda: team size is large and blocksize (mb,nb) is small + /// case GPU: team size is large and blocksize (mb,nb) is small InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, 0); InnerTrsmLeftLowerNonUnitDiag trsm_n(as0, as1, bs0, 0); diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 2347c63e87..6d6fe4edbd 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -270,7 +270,7 @@ namespace KokkosBatched { // regieter blocking (not about team parallelism). // this mb should vary according to // - team policy (smaller) or range policy (bigger) - // - space (cuda vs host) + // - space (gpu vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. #if defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION static constexpr diff --git a/src/batched/KokkosBatched_Vector.hpp b/src/batched/KokkosBatched_Vector.hpp index 8737d72850..28a537f885 100644 --- a/src/batched/KokkosBatched_Vector.hpp +++ b/src/batched/KokkosBatched_Vector.hpp @@ -104,6 +104,25 @@ namespace KokkosBatched { }; #endif +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; + template<> + struct DefaultVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 16 }; + }; +#endif + template struct DefaultInternalVectorLength { enum : int { value = 1 }; @@ -147,6 +166,25 @@ namespace KokkosBatched { enum : int { value = 1 }; }; #endif + +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct DefaultInternalVectorLength { + enum : int { value = 8 }; + }; + template<> + struct DefaultInternalVectorLength { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 4 }; + }; + template<> + struct DefaultInternalVectorLength,Kokkos::Experimental::HIPSpace> { + enum : int { value = 2 }; + }; +#endif template struct MagnitudeScalarType; diff --git a/src/batched/KokkosBatched_Vector_SIMD.hpp b/src/batched/KokkosBatched_Vector_SIMD.hpp index d59f0f9be4..e8fe83b7e2 100644 --- a/src/batched/KokkosBatched_Vector_SIMD.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD.hpp @@ -129,7 +129,7 @@ namespace KokkosBatched { } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) namespace KokkosBatched { template<> @@ -143,7 +143,7 @@ namespace KokkosBatched { typedef float2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat2"; } + static const 
char* label() { return "GpuFloat2"; } template friend class Vector; @@ -224,7 +224,7 @@ namespace KokkosBatched { typedef double2 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble2"; } + static const char* label() { return "GpuDouble2"; } template friend class Vector; @@ -305,7 +305,7 @@ namespace KokkosBatched { typedef float4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaFloat4"; } + static const char* label() { return "GpuFloat4"; } template friend class Vector; @@ -400,7 +400,7 @@ namespace KokkosBatched { typedef double4 data_type; KOKKOS_INLINE_FUNCTION - static const char* label() { return "CudaDouble4"; } + static const char* label() { return "GpuDouble4"; } template friend class Vector; diff --git a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp index 95ab97d882..43ddbb101b 100644 --- a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp @@ -77,7 +77,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -298,7 +298,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -568,7 +568,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -858,7 +858,7 @@ namespace KokkosBatched { return r_val; } -#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 74d15af1c3..db5bc9fbca 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -139,8 +139,8 @@ struct SingleLevelNontransposeGEMV { // matrix A and the input vector x. The output vector y is the // reduction result. // -// WARNING: NOT RECOMMENDED FOR CUDA. Reduction result may have -// arbitrary length. This is bad on CUDA because the CUDA +// WARNING: NOT RECOMMENDED FOR GPU. Reduction result may have +// arbitrary length. This is bad on GPU because the GPU // implementation of Kokkos::parallel_reduce may use shared memory for // intermediate results. 
template { }; #endif +#ifdef KOKKOS_ENABLE_HIP +template +struct impl_gemm_choose_copy_layout { + typedef LayoutA type; +}; +#endif + // DeepCopy matrix block into scratch template struct impl_deep_copy_matrix_block; diff --git a/src/blas/impl/KokkosBlas3_gemm_spec.hpp b/src/blas/impl/KokkosBlas3_gemm_spec.hpp index 877d73c5fa..2a63c3736f 100644 --- a/src/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -157,6 +157,10 @@ struct GEMM { if(std::is_same::value) team_size = blockA0; #endif + #if defined(KOKKOS_ENABLE_HIP) + if(std::is_same::value) + team_size = blockA0; + #endif #if defined(KOKKOS_ENABLE_ROCM) if(std::is_same::value) team_size = blockA0; diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index b22d86a8bb..28b2a01389 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -51,6 +51,7 @@ namespace KokkosKernels{ namespace Impl{ // POP COUNT function returns the number of set bits +// Note BMK: HIP also defines __CUDA_ARCH__, and provides the same intrinsics. #if defined( __CUDA_ARCH__ ) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index c0ae6ce5eb..22930c82e1 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -53,9 +53,9 @@ namespace KokkosKernels{ namespace Impl{ -enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA}; +enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; template -inline ExecSpaceType kk_get_exec_space_type(){ +constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -81,6 +81,12 @@ inline ExecSpaceType kk_get_exec_space_type(){ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + exec_space = Exec_HIP; + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ exec_space = Exec_QTHREADS; @@ -90,6 +96,48 @@ inline ExecSpaceType kk_get_exec_space_type(){ } +template +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + auto exec = kk_get_exec_space_type(); + //TODO BMK: Add OpenMPTarget and any other future GPU exec spaces + return exec == Exec_CUDA || exec == Exec_HIP; +} + +//Host function to determine free and total device memory. +//Will throw if execution space doesn't support this. 
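+// Usage sketch (hypothetical sizes), assuming a HIP build:
+//   size_t free_mem = 0, total_mem = 0;
+//   KokkosKernels::Impl::kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
+//       free_mem, total_mem);
+//   // free_mem/total_mem now hold bytes as reported by hipMemGetInfo.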
+template +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + +#ifdef KOKKOS_ENABLE_CUDA +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + cudaMemGetInfo(&free_mem, &total_mem); +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) +{ + hipMemGetInfo(&free_mem, &total_mem); +} +#endif inline int kk_get_suggested_vector_size( const size_t nr, const size_t nnz, const ExecSpaceType exec_space){ @@ -103,7 +151,7 @@ inline int kk_get_suggested_vector_size( case Exec_QTHREADS: break; case Exec_CUDA: - + case Exec_HIP: if (nr > 0) suggested_vector_size_ = nnz / double (nr) + 0.5; if (suggested_vector_size_ < 3){ @@ -119,7 +167,14 @@ inline int kk_get_suggested_vector_size( suggested_vector_size_ = 16; } else { - suggested_vector_size_ = 32; + if(exec_space == Exec_CUDA || suggested_vector_size_ <= 48) { + //use full CUDA warp, or half a HIP wavefront + suggested_vector_size_ = 32; + } + else { + //use full HIP wavefront + suggested_vector_size_ = 64; + } } break; } @@ -129,7 +184,9 @@ inline int kk_get_suggested_vector_size( inline int kk_get_suggested_team_size(const int vector_size, const ExecSpaceType exec_space){ - if (exec_space == Exec_CUDA){ + if (exec_space == Exec_CUDA || exec_space == Exec_HIP) { + //TODO: where this is used, tune the target value for + //threads per block (but 256 is probably OK for CUDA and HIP) return 256 / vector_size; } else { @@ -171,6 +228,25 @@ struct SpaceInstance { }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct SpaceInstance { + static Kokkos::Experimental::HIP create() { + hipStream_t stream; + hipStreamCreate(&stream); + return Kokkos::Experimental::HIP(stream); + } + static void destroy(Kokkos::Experimental::HIP& space) { + hipStream_t stream = space.hip_stream(); + hipStreamDestroy(stream); + } + static bool overlap() { + //TODO: does HIP have an equivalent for CUDA_LAUNCH_BLOCKING? + return true; + } +}; +#endif + } } diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 9d43ba670c..2e335d4f04 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -371,7 +371,7 @@ class KokkosKernelsHandle return this->team_work_size; } else { - if (my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (my_exec_space == KokkosKernels::Impl::Exec_CUDA || my_exec_space == KokkosKernels::Impl::Exec_HIP) { return team_size; } else { diff --git a/src/common/KokkosKernels_Macros.hpp b/src/common/KokkosKernels_Macros.hpp index 84de9048c9..ced946fe4f 100644 --- a/src/common/KokkosKernels_Macros.hpp +++ b/src/common/KokkosKernels_Macros.hpp @@ -46,10 +46,10 @@ #define _KOKKOSKERNELS_MACROUTILS_HPP_ // If KOKKOSKERNELS_ENABLE_OMP_SIMD is defined, it's legal to place -// "#pragma omp simd" before a for loop. It's never defined if CUDA is enabled, +// "#pragma omp simd" before a for loop. 
It's never defined if a GPU-type device is enabled, // since in that case, Kokkos::ThreadVectorRange should be used instead for SIMD parallel loops. -#if !defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_OPENMP) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ENABLE_OPENMP) #if defined(KOKKOS_COMPILER_GNU) // GCC 4.8.5 and older do not support #pragma omp simd #if (KOKKOS_COMPILER_GNU > 485 ) diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 2547c2e1b9..7628e6de31 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1041,12 +1041,7 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; @@ -1094,12 +1089,7 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; -#ifdef KOKKOS_ENABLE_CUDA - //only CUDA benefits from using team-based bitonic - bool useRadix = std::is_same::value ? false : true; -#else - bool useRadix = true; -#endif + bool useRadix = !kk_is_gpu_exec_space(); SortCrsGraphFunctor funct(useRadix, rowmap, entries); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; @@ -1353,74 +1343,45 @@ void kk_sort_graph( out_scalar_view_t out_vals){ ExecSpaceType exec = kk_get_exec_space_type(); - if (exec == Exec_CUDA){ - typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); - Kokkos::deep_copy (hr, in_xadj); - typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); - Kokkos::deep_copy (he, in_adj); - typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); - Kokkos::deep_copy (hv, in_vals); - MyExecSpace().fence(); - - typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); - typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); + // If possible, sort on host and avoid a deep copy + // TODO BMK: can this function be deprecated? 
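+ // Note: when MyExecSpace is host-accessible, each create_mirror_view below
+ // aliases the input view and the matching deep_copy is effectively a no-op,
+ // so the host path pays no extra copies; on CUDA/HIP this costs one round
+ // trip through host memory per array.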
+ typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); + Kokkos::deep_copy (hr, in_xadj); + typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); + Kokkos::deep_copy (he, in_adj); + typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); + Kokkos::deep_copy (hv, in_vals); + MyExecSpace().fence(); + typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); + typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; + typedef typename lno_view_t::non_const_value_type size_type; + typedef typename lno_nnz_view_t::non_const_value_type lno_t; + typedef typename scalar_view_t::non_const_value_type scalar_t; - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); + lno_t nrows = in_xadj.extent(0) - 1; + std::vector > edges(in_adj.extent(0)); - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = hr(i); j < hr(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = he(j); - edges[row_size++].ew = hv(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - heo(i) = edges[i].dst; - hvo(i) = edges[i].ew; + size_type row_size = 0; + for (lno_t i = 0; i < nrows; ++i){ + for (size_type j = hr(i); j < hr(i + 1); ++j){ + edges[row_size].src = i; + edges[row_size].dst = he(j); + edges[row_size++].ew = hv(j); } - - - Kokkos::deep_copy (out_adj, heo); - Kokkos::deep_copy (out_vals, hvo); - MyExecSpace().fence(); } - else { - - - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; - - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); - - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = in_xadj(i); j < in_xadj(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = in_adj(j); - edges[row_size++].ew = in_vals(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - out_adj(i) = edges[i].dst; - out_vals(i) = edges[i].ew; - } - + std::sort (edges.begin(), edges.begin() + row_size); + size_type ne = in_adj.extent(0); + for(size_type i = 0; i < ne; ++i){ + heo(i) = edges[i].dst; + hvo(i) = edges[i].ew; + } - } + Kokkos::deep_copy (out_adj, heo); + Kokkos::deep_copy (out_vals, hvo); + MyExecSpace().fence(); } /* @@ -1714,47 +1675,46 @@ struct LowerTriangularMatrix{ const size_type write_end = t_xadj[row_index + 1]; const lno_t write_left_work = write_end - write_begin; - switch (exec_space){ - case Exec_CUDA: - //TODO: Write cuda version here. - /* + //TODO: Write GPU (vector-level) version here: + /* + if(kk_is_gpu_exec_space()) + { Kokkos::parallel_for( Kokkos::ThreadVectorRange(teamMember, read_left_work), [&] (lno_t i) { const size_type adjind = i + col_begin; const lno_t colIndex = adj[adjind]; - }); - */ + } + else + ... 
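+ One possible shape (untested sketch): compute the write offsets with a
+ vector-level exclusive scan over the keep flags, e.g.
+
+ Kokkos::parallel_scan(
+ Kokkos::ThreadVectorRange(teamMember, read_left_work),
+ [&](const lno_t i, lno_t& offset, const bool final) {
+ const lno_t colIndex = adj[col_begin + i];
+ const bool keep = ...; same lower/upper test as the serial loop below
+ if (final && keep) {
+ if (in_vals != NULL) t_vals[write_begin + offset] = in_vals[col_begin + i];
+ t_adj[write_begin + offset] = colIndex;
+ }
+ offset += keep ? 1 : 0;
+ });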
+ */ - default: - for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ - const size_type adjind = r + col_begin; - const lno_t colIndex = adj[adjind]; - lno_t colperm = colIndex; - if (permutation != NULL){ - colperm = permutation[colIndex]; - } - if (is_lower){ - if (row_perm > colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + for (lno_t r = 0 , w = 0; r < read_left_work && w < write_left_work; ++r){ + const size_type adjind = r + col_begin; + const lno_t colIndex = adj[adjind]; + lno_t colperm = colIndex; + if (permutation != NULL){ + colperm = permutation[colIndex]; + } + if (is_lower){ + if (row_perm > colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } - else { - if (row_perm < colperm){ - if (in_vals != NULL){ - t_vals[write_begin + w] = in_vals[adjind]; - } - t_adj[write_begin + w++] = colIndex; + } + else { + if (row_perm < colperm){ + if (in_vals != NULL){ + t_vals[write_begin + w] = in_vals[adjind]; } + t_adj[write_begin + w++] = colIndex; } + } - } - break; } }); } @@ -2340,7 +2300,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( bool use_dynamic_scheduling = false, bool chunksize = 4){ -#ifndef KOKKOS_ENABLE_CUDA //typedef typename row_map_view_t::const_type const_row_map_view_t; //typedef typename cols_view_t::const_type const_cols_view_t; @@ -2381,7 +2340,6 @@ void kk_create_incidence_tranpose_matrix_from_lower_triangle( } }); -#endif } template void get_suggested_vector_size( int &suggested_vector_size_, - idx nr, idx nnz){ - - suggested_vector_size_ = 1; - -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - suggested_vector_size_ = nnz / double (nr) + 0.5; - - if (suggested_vector_size_ <= 3){ - suggested_vector_size_ = 2; - } - else if (suggested_vector_size_ <= 6){ - suggested_vector_size_ = 4; - } - else if (suggested_vector_size_ <= 12){ - suggested_vector_size_ = 8; - } - else if (suggested_vector_size_ <= 24){ - suggested_vector_size_ = 16; - } - else { - suggested_vector_size_ = 32; - } - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - suggested_vector_size_ = 1; - } -#endif - + idx nr, idx nnz) { + suggested_vector_size_ = kk_get_suggested_vector_size(nr, nnz, get_exec_space_type()); } //Get the best team size for the given functor. 
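// Usage sketch of the consolidated wrapper (hypothetical sizes; template
// parameters abbreviated by this note), assuming a HIP execution space:
//   int vector_size = 0;
//   get_suggested_vector_size<size_t, Kokkos::Experimental::HIP>(
//       vector_size, numRows, numEntries);
//   // Rows averaging more than ~48 entries now get the full 64-lane HIP
//   // wavefront, where CUDA still caps at a 32-lane warp.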
@@ -152,34 +103,28 @@ void get_suggested_vector_size( template int get_suggested_team_size(Functor& f, int vector_size) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp(1, 1, vector_size); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } template int get_suggested_team_size(Functor& f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { -#ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + using execution_space = typename team_policy_t::traits::execution_space; + if(kk_is_gpu_exec_space()) { team_policy_t temp = team_policy_t(1, 1, vector_size). set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); } else -#endif - { return 1; - } } template nnz_lno_persistent_work_view_t; typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type - typedef Kokkos::TeamPolicy team_policy_t ; + typedef Kokkos::TeamPolicy team_policy_t ; typedef typename team_policy_t::member_type team_member_t ; typedef typename Kokkos::View non_const_1d_size_type_view_t; @@ -229,54 +229,17 @@ class GraphColoringHandle } - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. + /** \brief Chooses best algorithm based on the execution space. COLORING_SERIAL if serial, otherwise COLORING_VBBIT. + * VBBIT is the fastest parallel algorithm (unless on GPU and the graph's maximum degree is very large, but + * we don't have information about the graph here) */ void choose_default_algorithm() { -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ + auto exec = KokkosKernels::Impl::kk_get_exec_space_type(); + if(exec == KokkosKernels::Impl::Exec_SERIAL) this->coloring_algorithm_type = COLORING_SERIAL; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - this->coloring_algorithm_type = COLORING_EB; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - this->coloring_algorithm_type = COLORING_VB; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_VB" << std::endl; -#endif - } -#endif + else + this->coloring_algorithm_type = COLORING_VBBIT; } template @@ -463,7 +426,7 @@ class GraphColoringHandle row_index_view_type xadj, nonzero_view_type adj){ KokkosKernels::Impl::symmetrize_and_get_lower_diagonal_edge_list - + ( nv, xadj, @@ -496,13 +459,8 @@ class GraphColoringHandle size_type_temp_work_view_t lower_count("LowerXADJ", nv + 1); size_type 
new_num_edge = 0; - typedef Kokkos::RangePolicy my_exec_space; - - if ( false -#if defined( KOKKOS_ENABLE_CUDA ) - || std::is_same::value -#endif - ) + typedef Kokkos::RangePolicy my_exec_space; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { @@ -522,10 +480,10 @@ class GraphColoringHandle clt//, new_num_edge ); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); //Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - HandleExecSpace().fence(); + ExecutionSpace().fence(); auto lower_total_count = Kokkos::subview(lower_count, nv); auto hlower = Kokkos::create_mirror_view (lower_total_count); Kokkos::deep_copy (hlower, lower_total_count); @@ -551,7 +509,7 @@ class GraphColoringHandle //Kokkos::parallel_scan (my_exec_space(0, nv + 1), PPS(lower_count)); - KokkosKernels::Impl::inclusive_parallel_prefix_sum + KokkosKernels::Impl::inclusive_parallel_prefix_sum (nv+1, lower_count); nnz_lno_persistent_work_view_t half_src (Kokkos::ViewAllocateWithoutInitializing("HALF SRC"),new_num_edge); nnz_lno_persistent_work_view_t half_dst (Kokkos::ViewAllocateWithoutInitializing("HALF DST"),new_num_edge); diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp index f4624f545b..4c392051fb 100644 --- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -198,71 +198,17 @@ class GraphColorDistance2Handle * Chooses best algorithm based on the execution space. * * This chooses the best algorithm based on the execution space: - * - COLORING_D2_SERIAL if the execution space is SERIAL - * - COLORING_D2_NB_BIT otherwise + * - COLORING_D2_SERIAL if the execution space is SERIAL (more work efficient than NB_BIT) + * - COLORING_D2_NB_BIT otherwise (fastest parallel algorithm) * */ void choose_default_algorithm() { - bool found = false; -#if defined(KOKKOS_ENABLE_SERIAL) - if(std::is_same::value) - { + if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL) this->coloring_algorithm_type = COLORING_D2_SERIAL; - found = true; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_THREADS) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_OPENMP) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if(std::is_same::value) - { - this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - -#if defined(KOKKOS_ENABLE_QTHREAD) - if(std::is_same::value) - { + else this->coloring_algorithm_type = COLORING_D2_NB_BIT; - found = true; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: COLORING_D2_NB_BIT" << std::endl; -#endif - } -#endif - //Since this logic is based on checking every exec space, detect when a new one needs to be supported - if(!found) - throw std::logic_error("D2 coloring: default algorithm hasn't been chosen for 
the current execution space"); } diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 0a5493df7d..866ad54daf 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -396,7 +396,7 @@ struct D2_MIS_RandomPriority Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(colWorklist)); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = (execSpaceEnum == KokkosKernels::Impl::Exec_CUDA) && (entries.extent(0) / numVerts >= 16); + bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; lno_t rowWorkLen = numVerts; diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index 938d6e91be..c618d3add6 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -104,6 +104,12 @@ inline int RowsPerThread(const int NNZPerRow) { return 1; } #endif +#ifdef KOKKOS_ENABLE_HIP +template<> +inline int RowsPerThread(const int NNZPerRow) { + return 1; +} +#endif // A simple struct for storing a kernel launch configuration. // This is currently used by CrsMatrix to allow the user to have some control diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index 2def3a17f1..fd4a9b58d9 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -274,53 +274,11 @@ namespace KokkosSparse{ void set_block_size(nnz_lno_t bs){this->block_size = bs; } nnz_lno_t get_block_size() const {return this->block_size;} - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. 
- */ void choose_default_algorithm(){ -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Serial Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "PTHREAD Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "OpenMP Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) this->algorithm_type = GS_TEAM; -#ifdef VERBOSE - std::cout << "Cuda Execution Space, Default Algorithm: GS_TEAM" << std::endl; -#endif - } -#endif - -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ + else this->algorithm_type = GS_PERMUTED; -#ifdef VERBOSE - std::cout << "Qthread Execution Space, Default Algorithm: GS_PERMUTED" << std::endl; -#endif - } -#endif } ~PointGaussSeidelHandle() = default; @@ -559,33 +517,7 @@ namespace KokkosSparse{ bool use_teams() const { - bool return_value = false; -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value) { - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - return_value = false; - } -#endif -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - return_value = true; - } -#endif -#if defined( KOKKOS_ENABLE_QTHREAD) - if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ - return_value = false; - } -#endif - return return_value; + return KokkosKernels::Impl::kk_is_gpu_exec_space(); } ~ClusterGaussSeidelHandle() = default; diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 820afbbaa3..9ed66ce2ad 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -202,67 +202,6 @@ struct UnmergedSumFunctor { CcolindsT ABperm; }; -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, const CcolindsT& Ccolinds_, - const CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), - Ccolinds(Ccolinds_), - CcolindsAux("C colind aux", Ccolinds_.extent(0)), - ABperm(ABperm_), - ABpermAux("AB perm aux", ABperm_.extent(0)) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - ordinal_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - using lno_t = typename CcolindsT::non_const_value_type; - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::Impl::SerialRadixSort2( - (unsigned_lno_t*)Ccolinds.data() + rowStart, - (unsigned_lno_t*)CcolindsAux.data() + 
rowStart, - ABperm.data() + rowStart, ABpermAux.data() + rowStart, rowNum); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT CcolindsAux; - CcolindsT ABperm; - CcolindsT ABpermAux; -}; - -#ifdef KOKKOS_ENABLE_CUDA -template -struct SortEntriesFunctor { - SortEntriesFunctor(const CrowptrsT& Crowptrs_, CcolindsT& Ccolinds_, - CcolindsT& ABperm_) - : Crowptrs(Crowptrs_), Ccolinds(Ccolinds_), ABperm(ABperm_) {} - typedef typename Kokkos::TeamPolicy::member_type TeamMember; - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - // 3: Sort each row's colinds (permuting values at same time), then count - // unique colinds (write that to Crowptr(i)) CrowptrTemp tells how many - // entries in each oversized row - size_type i = t.league_rank(); - size_type rowStart = Crowptrs(i); - size_type rowEnd = Crowptrs(i + 1); - size_type rowNum = rowEnd - rowStart; - KokkosKernels::Impl::TeamBitonicSort2< - size_type, typename CcolindsT::non_const_value_type, - typename CcolindsT::non_const_value_type, TeamMember>( - Ccolinds.data() + rowStart, ABperm.data() + rowStart, rowNum, t); - } - CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT ABperm; -}; -#endif - template struct MergeEntriesFunctor { diff --git a/src/sparse/KokkosSparse_spgemm_handle.hpp b/src/sparse/KokkosSparse_spgemm_handle.hpp index b34d349457..f517682d5e 100644 --- a/src/sparse/KokkosSparse_spgemm_handle.hpp +++ b/src/sparse/KokkosSparse_spgemm_handle.hpp @@ -504,8 +504,6 @@ class SPGEMMHandle{ return this->cuSPARSEHandle; } #endif - /** \brief Chooses best algorithm based on the execution space. COLORING_EB if cuda, COLORING_VB otherwise. - */ void choose_default_algorithm(){ #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -543,6 +541,15 @@ class SPGEMMHandle{ } #endif +#if defined( KOKKOS_ENABLE_HIP ) + if (std::is_same::value){ + this->algorithm_type = SPGEMM_KK; +#ifdef VERBOSE + std::cout << "HIP Execution Space, Default Algorithm: SPGEMM_KK" << std::endl; +#endif + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ this->algorithm_type = SPGEMM_SERIAL; @@ -604,67 +611,20 @@ class SPGEMMHandle{ //suggested_vector_size_=this->suggested_vector_size = 1; //return; if (this->suggested_team_size && this->suggested_vector_size) { + //already set in the handle suggested_vector_size_ = this->suggested_vector_size; suggested_team_size_ = this->suggested_team_size; return; } -#if defined( KOKKOS_ENABLE_SERIAL ) - if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_THREADS ) - if (std::is_same< Kokkos::Threads , ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - return; - } -#endif - -#if defined( KOKKOS_ENABLE_OPENMP ) - if (std::is_same< Kokkos::OpenMP, ExecutionSpace >::value){ - suggested_vector_size_ = this->suggested_vector_size = 1; - suggested_team_size_ = this->suggested_team_size = max_allowed_team_size; - } -#endif - -#if defined( KOKKOS_ENABLE_CUDA ) - if (std::is_same::value){ - - this->suggested_vector_size = nnz / double (nr) + 0.5; - - if (this->suggested_vector_size <= 3){ - this->suggested_vector_size = 2; - } - else if (this->suggested_vector_size <= 6){ - this->suggested_vector_size = 
4;
-    }
-    else if (this->suggested_vector_size <= 12){
-      this->suggested_vector_size = 8;
-    }
-    else if (this->suggested_vector_size <= 24){
-      this->suggested_vector_size = 16;
-    }
-    else {
-      this->suggested_vector_size = 32;
-    }
-
-      suggested_vector_size_ = this->suggested_vector_size;
-      this->suggested_team_size= suggested_team_size_ = max_allowed_team_size / this->suggested_vector_size;
-    }
-#endif
-
-#if defined( KOKKOS_ENABLE_QTHREAD)
-    if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){
-      suggested_vector_size_ = this->suggested_vector_size = 1;
-      suggested_team_size_ = this->suggested_team_size = max_allowed_team_size;
-    }
-#endif
-
+    //otherwise, recompute team_size/vector_size based on heuristic and save them in the handle
+    suggested_vector_size_ = KokkosKernels::Impl::kk_get_suggested_vector_size(nr, nnz, KokkosKernels::Impl::kk_get_exec_space_type<ExecutionSpace>());
+    if(KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>())
+      suggested_team_size_ = max_allowed_team_size / suggested_vector_size_;
+    else
+      suggested_team_size_ = max_allowed_team_size;
+    this->suggested_vector_size = suggested_vector_size_;
+    this->suggested_team_size = suggested_team_size_;
   }

   void set_compression_steps(bool isCompressionSingleStep){
diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
index 03eef00e4d..d956ed8d4d 100644
--- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
@@ -554,9 +554,8 @@ namespace KokkosSparse{
             }
           });

-#if !defined(__CUDA_ARCH__)
 #if KOKKOSSPARSE_IMPL_PRINTDEBUG
-          if (/*i == 0 && ii == 1*/ ii == 0 || (block_size == 1 && ii < 2) ){
+          if (!KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>() && (ii == 0 || (block_size == 1 && ii < 2))){
             std::cout << "\n\n\nrow:" << ii * block_size + i;
             std::cout << "\nneighbors:";
             for (nnz_lno_t z = 0; z < block_row_size; ++z){
@@ -573,7 +572,6 @@ namespace KokkosSparse{
             std::cout << std::endl << "block_row_index:" << ii * block_size + i << " _Xvector(block_row_index):" << _Xvector(ii * block_size + i, vec) << std::endl << std::endl<< std::endl;
           }
-#endif
 #endif
           //row_begin += row_size * block_size;
         }
@@ -737,31 +735,16 @@ namespace KokkosSparse{
         timer.reset();
 #endif

-
-#if defined( KOKKOS_ENABLE_CUDA )
-        if (std::is_same::value){
-          for (nnz_lno_t i = 0; i < numColors; ++i){
-            nnz_lno_t color_index_begin = h_color_xadj(i);
-            nnz_lno_t color_index_end = h_color_xadj(i + 1);
-
-            if (color_index_begin + 1 >= color_index_end ) continue;
-            auto colorsubset =
-                subview(color_adj, Kokkos::pair (color_index_begin, color_index_end));
-            MyExecSpace().fence();
-            Kokkos::sort (colorsubset);
-            //TODO: MD 08/2017: If I remove the below fence, code fails on cuda.
-            //I do not see any reason yet it to fail.
-            MyExecSpace().fence();
-          }
-        }
-#endif
-
-        MyExecSpace().fence();
+        // TODO BMK: Why are the vertices in each color set only being sorted on GPU?
+        // Wouldn't it have a locality benefit on CPU too?
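The hunks above collapse per-backend #ifdef ladders into one compile-time test. Below is a minimal sketch of that dispatch idiom, assuming a helper shaped like KokkosKernels::Impl::kk_is_gpu_exec_space; the specialization list and the get_team_sizes wrapper are illustrative, not the library's exact code.

    #include <Kokkos_Core.hpp>

    // The general template answers "not a GPU"; each device backend
    // enabled at configure time specializes the answer to true.
    template <typename ExecSpace>
    constexpr bool is_gpu_exec_space() { return false; }

    #ifdef KOKKOS_ENABLE_CUDA
    template <>
    constexpr bool is_gpu_exec_space<Kokkos::Cuda>() { return true; }
    #endif
    #ifdef KOKKOS_ENABLE_HIP
    template <>
    constexpr bool is_gpu_exec_space<Kokkos::Experimental::HIP>() { return true; }
    #endif

    // Usage mirroring the handle code above: one branch now covers CUDA
    // and HIP alike, instead of one #ifdef block per backend.
    template <typename ExecSpace>
    void get_team_sizes(int max_team, int vector_length, int& team_out) {
      if (is_gpu_exec_space<ExecSpace>())
        team_out = max_team / vector_length; // GPU: team split across vector lanes
      else
        team_out = max_team;                 // host: vector length is effectively 1
    }

With this shape, bringing up a new GPU backend means adding one specialization rather than touching every call site.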
+        if(KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
+          KokkosKernels::Impl::sort_crs_graph<MyExecSpace>(color_xadj, color_adj);
+          MyExecSpace().fence();
 #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE
-        std::cout << "SORT_TIME:" << timer.seconds() << std::endl;
-        timer.reset();
-        //std::cout << "sort" << std::endl;
+          std::cout << "SORT_TIME:" << timer.seconds() << std::endl;
+          timer.reset();
 #endif
+        }

         row_lno_persistent_work_view_t permuted_xadj ("new xadj", num_rows + 1);
         nnz_lno_persistent_work_view_t old_to_new_map ("old_to_new_index_", num_rows );
@@ -844,7 +827,7 @@ namespace KokkosSparse{
         nnz_lno_t num_big_rows = 0;

         KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space();
-        if (ex_sp != KokkosKernels::Impl::Exec_CUDA){
+        if (!KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
           //again, if it is on CPUs, we make L1 as big as we need.
           size_t l1mem = 1;
           while(l1mem < level_1_mem){
@@ -882,12 +865,11 @@ namespace KokkosSparse{
           num_big_rows = KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(MyExecSpace::concurrency() / suggested_vector_size));
           //std::cout << "num_big_rows:" << num_big_rows << std::endl;

-#if defined( KOKKOS_ENABLE_CUDA )
-          if (ex_sp == KokkosKernels::Impl::Exec_CUDA) {
+          if (KokkosKernels::Impl::kk_is_gpu_exec_space<MyExecSpace>()) {
            //check if we have enough memory for this. lower the concurrency if we do not have enough memory.
            size_t free_byte ;
            size_t total_byte ;
-            cudaMemGetInfo( &free_byte, &total_byte ) ;
+            KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte);
            size_t required_size = size_t (num_big_rows) * level_2_mem;
            if (required_size + num_big_rows * sizeof(int) > free_byte){
              num_big_rows = ((((free_byte - num_big_rows * sizeof(int))* 0.8) /8 ) * 8) / level_2_mem;
@@ -900,7 +882,6 @@ namespace KokkosSparse{
              num_big_rows = min_chunk_size;
            }
          }
-#endif
        }
      }
@@ -1165,7 +1146,7 @@ namespace KokkosSparse{
       // change fill_matrix_numeric so that they store the internal matrix as above.
       // the rest will work fine.
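The hunk just above swaps cudaMemGetInfo for a backend-neutral query and then shrinks the pool to fit. The same fit-then-round computation recurs in the SpGEMM files below; here it is as a standalone sketch (fit_num_chunks is a hypothetical name, and the headroom factor varies by call site between 0.5 and 0.8 in this patch).

    #include <cstddef>

    // Cap a memory-pool chunk count by available device memory, then round
    // it down to a power of two, as the kernels in this patch do.
    size_t fit_num_chunks(size_t num_chunks, size_t chunk_bytes, size_t free_bytes) {
      size_t required = num_chunks * chunk_bytes;
      if (required > free_bytes) {
        // keep headroom: spend about half of free memory, in 8-byte multiples
        num_chunks = (((free_bytes / 2) / 8) * 8) / chunk_bytes;
      }
      size_t p2 = 1;               // largest power of two <= num_chunks,
      while (p2 * 2 <= num_chunks) // so chunk indexing stays cheap
        p2 *= 2;
      return p2;
    }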
- if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", team_policy_t(num_rows / rows_per_team + 1 , suggested_team_size, suggested_vector_size), fill_matrix_numeric( @@ -1209,7 +1190,7 @@ namespace KokkosSparse{ block_size, block_matrix_size); - if (this->handle->get_handle_exec_space() == KokkosKernels::Impl::Exec_CUDA || block_size > 1){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space() || block_size > 1){ Kokkos::parallel_for("KokkosSparse::GaussSeidel::team_get_matrix_diagonals", team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd ); diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp index 0ef887d80e..af10787c46 100644 --- a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -74,535 +74,6 @@ struct IotaFunctor View v; }; -template -struct RCM -{ - typedef typename HandleType::HandleExecSpace MyExecSpace; - typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - - typedef typename HandleType::size_type size_type; - typedef typename HandleType::nnz_lno_t nnz_lno_t; - - typedef typename lno_row_view_t::const_type const_lno_row_view_t; - typedef typename lno_row_view_t::non_const_type non_const_lno_row_view_t; - typedef typename non_const_lno_row_view_t::value_type offset_t; - - typedef typename lno_nnz_view_t::const_type const_lno_nnz_view_t; - typedef typename lno_nnz_view_t::non_const_type non_const_lno_nnz_view_t; - - typedef typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; - typedef typename HandleType::row_lno_persistent_work_view_t row_lno_persistent_work_view_t; - typedef typename HandleType::row_lno_persistent_work_host_view_t row_lno_persistent_work_host_view_t; //Host view type - - typedef typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t nnz_lno_persistent_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_host_view_t nnz_lno_persistent_work_host_view_t; //Host view type - - typedef nnz_lno_persistent_work_view_t nnz_view_t; - typedef Kokkos::View> single_view_t; - typedef Kokkos::View> single_view_host_t; - - typedef Kokkos::RangePolicy my_exec_space; - - typedef Kokkos::Device device_t; - - typedef Kokkos::RangePolicy range_policy_t ; - typedef Kokkos::TeamPolicy team_policy_t ; - typedef typename team_policy_t::member_type team_member_t ; - - typedef nnz_lno_t LO; - - RCM(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_) - : numRows(numRows_), rowmap(rowmap_), colinds(colinds_) - {} - - nnz_lno_t numRows; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - - //radix sort keys according to their corresponding values ascending. - //keys are NOT preserved since the use of this in RCM doesn't care about degree after sorting - template - KOKKOS_INLINE_FUNCTION static void - radixSortKeysAndValues(KeyType* keys, KeyType* keysAux, ValueType* values, ValueType* valuesAux, IndexType n, const member_t& mem) - { - if(n <= 1) - return; - //sort 4 bits at a time - KeyType mask = 0xF; - bool inAux = false; - //maskPos counts the low bit index of mask (0, 4, 8, ...) 
- IndexType maskPos = 0; - IndexType sortBits = 0; - KeyType minKey = Kokkos::ArithTraits::max(); - KeyType maxKey = 0; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lminkey) - { - if(keys[i] < lminkey) - lminkey = keys[i]; - }, Kokkos::Min(minKey)); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i, KeyType& lmaxkey) - { - if(keys[i] > lmaxkey) - lmaxkey = keys[i]; - }, Kokkos::Max(maxKey)); - //apply a bias so that key range always starts at 0 - //also invert key values here for a descending sort - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - keys[i] -= minKey; - }); - KeyType upperBound = maxKey - minKey; - while(upperBound) - { - upperBound >>= 1; - sortBits++; - } - for(IndexType s = 0; s < (sortBits + 3) / 4; s++) - { - //Count the number of elements in each bucket - IndexType count[16] = {0}; - IndexType offset[17]; - if(!inAux) - { - for(IndexType i = 0; i < n; i++) - { - count[(keys[i] & mask) >> maskPos]++; - } - } - else - { - for(IndexType i = 0; i < n; i++) - { - count[(keysAux[i] & mask) >> maskPos]++; - } - } - offset[0] = 0; - //get offset as the prefix sum for count - for(IndexType i = 0; i < 16; i++) - { - offset[i + 1] = offset[i] + count[i]; - } - //now for each element in [lo, hi), move it to its offset in the other buffer - //this branch should be ok because whichBuf is the same on all threads - if(!inAux) - { - //copy from *Over to *Aux - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keys[i] & mask) >> maskPos; - keysAux[offset[bucket + 1] - count[bucket]] = keys[i]; - valuesAux[offset[bucket + 1] - count[bucket]] = values[i]; - count[bucket]--; - } - } - else - { - //copy from *Aux to *Over - for(IndexType i = 0; i < n; i++) - { - IndexType bucket = (keysAux[i] & mask) >> maskPos; - keys[offset[bucket + 1] - count[bucket]] = keysAux[i]; - values[offset[bucket + 1] - count[bucket]] = valuesAux[i]; - count[bucket]--; - } - } - inAux = !inAux; - mask = mask << 4; - maskPos += 4; - } - //move keys/values back from aux if they are currently in aux, - //and remove bias - if(inAux) - { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(mem, n), - [=](size_type i) - { - //TODO: when everything works, is safe to remove next line - //since keys (BFS visit scores) will never be needed again - keys[i] = keysAux[i]; - values[i] = valuesAux[i]; - }); - } - } - - //Functor that does breadth-first search on a sparse graph. 
- struct BfsFunctor - { - typedef Kokkos::View> WorkView; - - BfsFunctor(const WorkView& workQueue_, const WorkView& scratch_, const nnz_view_t& visit_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const single_view_t& numLevels_, const nnz_view_t& threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_) - : workQueue(workQueue_), scratch(scratch_), visit(visit_), rowmap(rowmap_), colinds(colinds_), numLevels(numLevels_), threadNeighborCounts(threadNeighborCounts_), start(start_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - const nnz_lno_t QUEUED = NOT_VISITED - 1; - int nthreads = mem.team_size(); - nnz_lno_t tid = mem.team_rank(); - auto neighborList = Kokkos::subview(scratch, tid, Kokkos::ALL()); - //active and next indicate which buffer in workQueue holds the nodes in current/next frontiers, respectively - //active, next and visitCounter are thread-local, but always kept consistent across threads - int active = 0; - int next = 1; - nnz_lno_t visitCounter = 0; - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - workQueue(active, 0) = start; - visit(start) = QUEUED; - }); - nnz_lno_t activeQSize = 1; - nnz_lno_t nextQSize = 0; - //KK create_reverse_map() expects incoming values to start at 1 - nnz_lno_t level = 1; - //do this until all nodes have been visited and added to a level - while(visitCounter < numRows) - { - mem.team_barrier(); - //each thread works on a contiguous block of nodes in queue (for locality) - //compute in size_t to avoid possible 32-bit overflow - nnz_lno_t workStart = tid * activeQSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * activeQSize / nthreads; - //the maximum work batch size (among all threads) - //the following loop contains barriers so all threads must iterate same # of times - nnz_lno_t maxBatch = (activeQSize + nthreads - 1) / nthreads; - for(nnz_lno_t loop = 0; loop < maxBatch; loop++) - { - //this thread may not actually have anything to work on (if nthreads doesn't divide qSize) - bool busy = loop < workEnd - workStart; - nnz_lno_t neiCount = 0; - nnz_lno_t process = LNO_MAX; - if(busy) - { - process = workQueue(active, workStart + loop); - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - //build a list of all non-visited neighbors - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t col = colinds(j); - //use atomic here to guarantee neighbors are added to neighborList exactly once - if(col < numRows && Kokkos::atomic_compare_exchange_strong(&visit(col), NOT_VISITED, QUEUED)) - { - //this thread is the first to see that col needs to be queued - neighborList(neiCount) = col; - neiCount++; - } - } - } - threadNeighborCounts(tid) = neiCount; - mem.team_barrier(); - size_type queueUpdateOffset = 0; - for(nnz_lno_t i = 0; i < tid; i++) - { - queueUpdateOffset += threadNeighborCounts(i); - } - //write out all updates to next queue in parallel - if(busy) - { - nnz_lno_t nextQueueIter = 0; - for(nnz_lno_t i = 0; i < neiCount; i++) - { - nnz_lno_t toQueue = neighborList(i); - visit(toQueue) = QUEUED; - workQueue(next, nextQSize + queueUpdateOffset + nextQueueIter) = toQueue; - nextQueueIter++; - } - //assign level to to process - visit(process) = level; - } - nnz_lno_t totalAdded = 0; - for(nnz_lno_t i = 0; i < nthreads; i++) - { - totalAdded += threadNeighborCounts(i); - } - nextQSize += totalAdded; - mem.team_barrier(); - } - 
//swap queue buffers - active = next; - next = 1 - next; - //all threads have a consistent value of qSize here. - //update visitCounter in preparation for next frontier - visitCounter += activeQSize; - activeQSize = nextQSize; - nextQSize = 0; - if(visitCounter < numRows && activeQSize == 0) - { - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - //Some nodes are unreachable from start (graph not connected) - //Find an unvisited node to resume BFS - for(nnz_lno_t search = numRows - 1; search >= 0; search--) - { - if(visit(search) == NOT_VISITED) - { - workQueue(active, 0) = search; - visit(search) = QUEUED; - break; - } - } - }); - activeQSize = 1; - } - level++; - } - Kokkos::single(Kokkos::PerTeam(mem), - [&] - { - numLevels() = level - 1; - }); - } - - WorkView workQueue; - WorkView scratch; - nnz_view_t visit; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - single_view_t numLevels; - nnz_view_t threadNeighborCounts; - nnz_lno_t start; - nnz_lno_t numRows; - }; - - //Parallel breadth-first search, producing level structure in (xadj, adj) form: - //xadj(level) gives index in adj where level begins. - //Returns the total number of levels, and sets xadj, adj and maxDeg. - nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads) - { - //need to know maximum degree to allocate scratch space for threads - maxDeg = KokkosKernels::Impl::graph_max_degree(rowmap); - //view for storing the visit timestamps - nnz_view_t visit("BFS visited nodes", numRows); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - const nnz_lno_t NOT_VISITED = LNO_MAX; - KokkosBlas::fill(visit, NOT_VISITED); - //the visit queue - //one of q1,q2 is active at a time and holds the nodes to process in next BFS level - //elements which are LNO_MAX are just placeholders (nothing to process) - Kokkos::View> workQueue("BFS queue (double buffered)", 2, numRows); - nnz_view_t threadNeighborCounts("Number of nodes to queue on each thread", nthreads); - single_view_t numLevels("# of BFS levels"); - single_view_host_t numLevelsHost("# of BFS levels"); - Kokkos::View> scratch("Scratch buffer shared by threads", nthreads, maxDeg); - Kokkos::parallel_for(team_policy_t(1, nthreads), BfsFunctor(workQueue, scratch, visit, rowmap, colinds, numLevels, threadNeighborCounts, start, numRows)); - Kokkos::deep_copy(numLevelsHost, numLevels); - //now that level structure has been computed, construct xadj/adj - KokkosKernels::Impl::create_reverse_map - (numRows, numLevelsHost(), visit, xadj, adj); - return numLevelsHost(); - } - - struct CuthillMcKeeFunctor - { - typedef Kokkos::View> ScoreView; - - CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const ScoreView& scores_, const ScoreView& scoresAux_, const nnz_view_t& visit_, const nnz_view_t& xadj_, const nnz_view_t& adj_, const nnz_view_t& adjAux_) - : numLevels(numLevels_), maxDegree(maxDegree_), rowmap(rowmap_), colinds(colinds_), scores(scores_), scoresAux(scoresAux_), visit(visit_), xadj(xadj_), adj(adj_), adjAux(adjAux_) - { - numRows = rowmap.extent(0) - 1; - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const - { - int tid = mem.team_rank(); - int nthreads = mem.team_size(); - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - nnz_lno_t visitCounter = 0; - for(nnz_lno_t level = 0; level < numLevels; level++) - { - //iterate over vertices in this level and compute - //min predecessors 
(minimum-labeled vertices from previous level) - nnz_lno_t levelOffset = xadj(level); - nnz_lno_t levelSize = xadj(level + 1) - levelOffset; - //compute as offset_t to avoid overflow, but the upper bound on - //the scores is approx. numRows * maxDegree, which should be representable - nnz_lno_t workStart = tid * levelSize / nthreads; - nnz_lno_t workEnd = (tid + 1) * levelSize / nthreads; - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - nnz_lno_t minNeighbor = LNO_MAX; - offset_t rowStart = rowmap(process); - offset_t rowEnd = rowmap(process + 1); - for(offset_t j = rowStart; j < rowEnd; j++) - { - nnz_lno_t neighbor = colinds(j); - if(neighbor < numRows) - { - nnz_lno_t neighborVisit = visit(neighbor); - if(neighborVisit < minNeighbor) - minNeighbor = neighborVisit; - } - } - scores(i) = ((offset_t) minNeighbor * (maxDegree + 1)) + (rowmap(process + 1) - rowmap(process)); - } - mem.team_barrier(); - Kokkos::single(Kokkos::PerTeam(mem), - [&]() - { - radixSortKeysAndValues - (scores.data(), scoresAux.data(), adj.data() + levelOffset, adjAux.data(), levelSize, mem); - }); - mem.team_barrier(); - //label all vertices (which are now in label order within their level) - for(nnz_lno_t i = workStart; i < workEnd; i++) - { - nnz_lno_t process = adj(levelOffset + i); - //visit counter increases with levels, so flip the range for the "reverse" in RCM - visit(process) = visitCounter + i; - } - visitCounter += levelSize; - } - } - - nnz_lno_t numRows; - nnz_lno_t numLevels; - nnz_lno_t maxDegree; - const_lno_row_view_t rowmap; - const_lno_nnz_view_t colinds; - ScoreView scores; - ScoreView scoresAux; - nnz_view_t visit; - //The levels, stored in CRS format. - //xadj stores offsets for each level, and adj stores the rows in each level. 
- nnz_view_t xadj; - nnz_view_t adj; - nnz_view_t adjAux; - }; - - //Does the reversing in "reverse Cuthill-McKee") - struct OrderReverseFunctor - { - OrderReverseFunctor(const nnz_view_t& visit_, nnz_lno_t numRows_) - : visit(visit_), numRows(numRows_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - visit(i) = numRows - visit(i) - 1; - } - nnz_view_t visit; - nnz_lno_t numRows; - }; - - //breadth-first search, producing a reverse Cuthill-McKee ordering - nnz_view_t parallel_cuthill_mckee(nnz_lno_t start) - { - size_type nthreads = MyExecSpace::concurrency(); - if(nthreads > 64) - nthreads = 64; - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) - { - nthreads = 256; - } - #endif - nnz_view_t xadj, adj; - nnz_lno_t maxDegree = 0; - //parallel_bfs will compute maxDegree - auto numLevels = parallel_bfs(start, xadj, adj, maxDegree, nthreads); - //xadj determines where each level set starts and begins, - //so its max 'degree' gives the size of the largest level - nnz_lno_t maxLevelSize = KokkosKernels::Impl::graph_max_degree(xadj); - std::cout << "Maximum size of a level set: " << maxLevelSize << '\n'; - //visit (to be returned) contains the RCM numberings of each row - nnz_view_t visit("RCM labels", numRows); - //Populate visit wth LNO_MAX so that the "min-labeled neighbor" - //is always a node in the previous level - const nnz_lno_t LNO_MAX = Kokkos::ArithTraits::max(); - KokkosBlas::fill(visit, LNO_MAX); - //the "score" of a node is a single value that provides an ordering equivalent - //to sorting by min predecessor and then by min degree - //reduce nthreads to be a power of 2 - Kokkos::View> scores("RCM scores for sorting", maxLevelSize); - Kokkos::View> scoresAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - nnz_view_t adjAux("RCM scores for sorting (radix sort aux)", maxLevelSize); - Kokkos::parallel_for(team_policy_t(1, nthreads), CuthillMcKeeFunctor(numLevels, maxDegree, rowmap, colinds, scores, scoresAux, visit, xadj, adj, adjAux)); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(visit, numRows)); - return visit; - } - - template - struct MinDegreeRowFunctor - { - typedef typename Reducer::value_type Value; - MinDegreeRowFunctor(const const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {} - KOKKOS_INLINE_FUNCTION void operator()(const size_type i, Value& lval) const - { - size_type ideg = rowmap(i + 1) - rowmap(i); - if(ideg < lval.val) - { - lval.val = ideg; - lval.loc = i; - } - } - const_lno_row_view_t rowmap; - }; - - //parallel-for functor that assigns a cluster given a envelope-reduced reordering (like RCM) - struct OrderToClusterFunctor - { - OrderToClusterFunctor(const nnz_view_t& ordering_, const nnz_view_t& vertClusters_, nnz_lno_t clusterSize_) - : ordering(ordering_), vertClusters(vertClusters_), clusterSize(clusterSize_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const - { - vertClusters(i) = ordering(i) / clusterSize; - } - - const nnz_view_t ordering; - nnz_view_t vertClusters; - nnz_lno_t clusterSize; - }; - - //Find a peripheral node (one of minimal degree), suitable for starting RCM or BFS - nnz_lno_t find_peripheral() - { - typedef Kokkos::MinLoc MinLocReducer; - typedef typename MinLocReducer::value_type MinLocVal; - MinLocVal v; - Kokkos::parallel_reduce(range_policy_t(0, numRows), - MinDegreeRowFunctor(rowmap), MinLocReducer(v)); - return v.loc; - } - - nnz_view_t cuthill_mckee() - { - nnz_lno_t periph = find_peripheral(); 
- //run Cuthill-McKee BFS from periph - auto ordering = parallel_cuthill_mckee(periph); - return ordering; - } - - nnz_view_t rcm() - { - nnz_view_t cm = cuthill_mckee(); - //reverse the visit order (for the 'R' in RCM) - Kokkos::parallel_for(range_policy_t(0, numRows), OrderReverseFunctor(cm, numRows)); - return cm; - } - - nnz_view_t cm_cluster(nnz_lno_t clusterSize) - { - nnz_view_t cm = cuthill_mckee(); - nnz_view_t vertClusters("Vert to cluster", numRows); - OrderToClusterFunctor makeClusters(cm, vertClusters, clusterSize); - Kokkos::parallel_for(range_policy_t(0, numRows), makeClusters); - return vertClusters; - } -}; - template struct BalloonClustering { diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 6d240d11b3..c881c98ed4 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -219,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } @@ -761,6 +765,7 @@ bool KokkosSPGEMM { //get the execution space type. KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); //get the suggested vectorlane size based on the execution space, and average number of nnzs per row. int suggested_vector_size = this->handle->get_suggested_vector_size(n, nnz); //get the suggested team size. @@ -791,7 +796,7 @@ bool KokkosSPGEMM out_nnz_view_t set_nexts_; out_nnz_view_t set_begins_; #ifdef KOKKOSKERNELSMOREMEM - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { set_nexts_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_nexts_"), nnz); set_begins_ = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_begins_"), nnz); Kokkos::deep_copy (set_begins_, -1); @@ -804,8 +809,9 @@ bool KokkosSPGEMM } //if compressing in single step, allocate the memory as upperbound. - //TODO: two step is not there for cuda. - if (compress_in_single_step || lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + //TODO: two step is not there for GPU. 
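Each SpGEMM functor in this file and the ones below also gains a one-line Exec_HIP case in the switch that decides which row index a hash-map accumulator thread works on, mirroring the existing CUDA case. A sketch of that switch as a free function; the wrapper and the host fallback value are illustrative, since the surrounding functor code is elided in these hunks.

    #include "KokkosKernels_ExecSpaceUtils.hpp"

    // GPU backends give every thread its own row; the host fallback shown
    // here (one row block per team) is a stand-in for the elided logic.
    inline int hash_row_index(KokkosKernels::Impl::ExecSpaceType sp,
                              int row_index, int team_row_begin) {
      switch (sp) {
    #if defined(KOKKOS_ENABLE_CUDA)
        case KokkosKernels::Impl::Exec_CUDA: return row_index;
    #endif
    #if defined(KOKKOS_ENABLE_HIP)
        case KokkosKernels::Impl::Exec_HIP:  return row_index; // same policy as CUDA
    #endif
        default: return team_row_begin;
      }
    }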
+ + if (compress_in_single_step || exec_gpu) { out_nnz_indices = out_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("set_entries_"), nnz); out_nnz_sets = out_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("set_indices_"), nnz); } @@ -834,7 +840,8 @@ bool KokkosSPGEMM timer1.reset(); //bool compression_applied = false; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + #ifndef KOKKOSKERNELSMOREMEM size_type max_row_nnz = 0; @@ -856,27 +863,23 @@ bool KokkosSPGEMM size_t num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks*sizeof(int) > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - size_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } -#endif + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks*sizeof(int) > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; + } + { + size_t min_chunk_size = 1; + while (min_chunk_size * 2 <= num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; + } + } if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\tPOOL chunksize:" << chunksize << " num_chunks:" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index aa73c1e55b..4924e11b0c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -124,10 +124,9 @@ void KokkosSPGEMM KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); - //compress in single step if it is cuda execution space. - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA) { + //compress in single step if it is GPU. + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) compress_in_single_step = true; - } //compressed B fields. 
row_lno_temp_work_view_t new_row_mapB(Kokkos::ViewAllocateWithoutInitializing("new row map"), n+1); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 3f29c39e4e..38fce91b1b 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -234,6 +234,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1244,7 +1248,7 @@ void //choose parameters if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //then chose the best method and parameters. size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1374,7 +1378,7 @@ void //required memory for L2 - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; @@ -1419,12 +1423,9 @@ void } int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1439,7 +1440,6 @@ void num_chunks = min_chunk_size; } } -#endif // END SIZE CALCULATIONS FOR MEMORYPOOL @@ -1455,7 +1455,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1505,7 +1505,7 @@ void } timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; @@ -1617,7 +1617,7 @@ void KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1667,7 +1667,7 @@ void } timer1.reset(); - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1 , suggested_team_size, suggested_vector_size), sc); MyExecSpace().fence(); } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 415bd1ed3a..e3a4f492a6 100644 --- 
a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -143,6 +143,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -481,7 +485,7 @@ struct KokkosSPGEMM // // Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp // -// if Cuda enabled : +// if GPU: // "KokkosSparse::NumericCMEM::KKSPEED::GPU" : gpu_team_policy_t, i.e. GPUTag // // else : @@ -519,7 +523,7 @@ void Kokkos::Impl::Timer numeric_speed_timer_with_free; - if (my_exec_space_ == KokkosKernels::Impl::Exec_CUDA){ + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { //allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC (Kokkos::ViewAllocateWithoutInitializing("C keys"), valuesC_.extent(0)); diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 9f4f7ec753..29dbb5c477 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -210,6 +210,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -785,6 +789,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1493,13 +1501,14 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } maxNumRoughNonzeros = KOKKOSKERNELS_MACRO_MIN(this->b_col_cnt, maxNumRoughNonzeros); - int shmem_size_to_use = shmem_size; + int shmem_size_to_use = shmem_size; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; @@ -1511,7 +1520,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); //this kernel does not really work well if the vector size is less than 4. - if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1522,7 +1531,7 @@ void KokkosSPGEMM if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu){ //then chose the best method and parameters. 
current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1635,31 +1644,28 @@ void KokkosSPGEMM //initizalize value for the mem pool nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 <= num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; + } } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -1705,8 +1711,8 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - Kokkos::parallel_for("StructureC_NC::CUDA_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); + if (exec_gpu) { + Kokkos::parallel_for("StructureC_NC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { if (current_spgemm_algorithm == SPGEMM_KK_DENSE){ @@ -1791,8 +1797,9 @@ void KokkosSPGEMM ){ SPGEMMAlgorithm current_spgemm_algorithm = this->spgemm_algorithm; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; } @@ -1800,7 +1807,7 @@ void KokkosSPGEMM nnz_lno_t brows = row_mapB_.extent(0) - 1; size_type bnnz = entriesSetIndex.extent(0); size_type compressed_b_size = bnnz; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kk_reduce_diff_view (brows, old_row_mapB, row_mapB_, compressed_b_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1810,7 +1817,7 @@ void KokkosSPGEMM int suggested_vector_size = this->handle->get_suggested_vector_size(brows, compressed_b_size); //this kernel does not really work well if the vector size is less than 4. 
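The suggested vector size consumed here comes from the heuristic this patch centralizes into kk_get_suggested_vector_size; the inline CUDA version deleted from the spgemm handle earlier rounded the average row length to a power of two. A sketch consistent with that deleted ladder, where the cap of 32 matches CUDA warps and would presumably widen to 64 for AMD wavefronts.

    // Average entries per row, rounded to a power of two and capped at the
    // warp width; host backends do not vectorize here and get 1.
    inline int suggested_vector_size(size_t nr, size_t nnz, bool on_gpu) {
      if (!on_gpu) return 1;
      int v = static_cast<int>(nnz / static_cast<double>(nr) + 0.5);
      if (v <= 3)  return 2;
      if (v <= 6)  return 4;
      if (v <= 12) return 8;
      if (v <= 24) return 16;
      return 32;
    }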
- if (suggested_vector_size < 4 && lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tsuggested_vector_size:" << suggested_vector_size << " setting it to 4 for Structure kernel" << std::endl; } @@ -1821,7 +1828,7 @@ void KokkosSPGEMM int shmem_size_to_use = shmem_size; if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { //then chose the best method and parameters. current_spgemm_algorithm = SPGEMM_KK_MEMORY; int estimate_compress = 8; @@ -1951,7 +1958,7 @@ void KokkosSPGEMM } - if (current_spgemm_algorithm == SPGEMM_KK_DENSE && lcl_my_exec_space != KokkosKernels::Impl::Exec_CUDA){ + if (current_spgemm_algorithm == SPGEMM_KK_DENSE && !exec_gpu) { nnz_lno_t col_size = this->b_col_cnt / (sizeof (nnz_lno_t) * 8)+ 1; nnz_lno_t max_row_size = KOKKOSKERNELS_MACRO_MIN(col_size, maxNumRoughNonzeros); chunksize = col_size + max_row_size; @@ -1966,16 +1973,14 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1990,7 +1995,6 @@ void KokkosSPGEMM num_chunks = min_chunk_size; } } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -2035,7 +2039,7 @@ void KokkosSPGEMM timer1.reset(); - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for("KokkosSparse::StructureC::GPU_EXEC", gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -2584,6 +2588,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index d8997fcc12..27c0f4c7d9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -219,6 +219,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1322,17 +1326,17 @@ void KokkosSPGEMM ){ bool apply_compression = this->handle->get_spgemm_handle()->get_compression(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; - int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -1414,29 +1418,27 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; + if(exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 < num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; } - num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << @@ -1486,8 +1488,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { @@ -1682,6 +1683,7 @@ void KokkosSPGEMM b_lno_row_view_t_, b_lno_nnz_view_t_, b_scalar_nnz_view_t_>:: KokkosSPGEMM_symbolic_triangle_setup(){ + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); @@ -1733,7 +1735,7 @@ void KokkosSPGEMM } size_type bnnz = set_index_entries.extent(0); - if (this->MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { KokkosKernels::Impl::kkp_reduce_diff_view (this->b_row_cnt, p_rowmapB_begins, p_rowmapB_ends, bnnz); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index e59b95e8ac..ae913f864a 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -215,6 +215,10 @@ struct KokkosSPGEMM #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -892,12 +896,13 @@ void KokkosSPGEMM const int num_left_side_nnz_per_row = 2; const nnz_lno_t * min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); //this kernel does not really work well if the vector size is less than 4. 
- if (suggested_vector_size < 4 && MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA){ + if (suggested_vector_size < 4 && exec_gpu) { if (KOKKOSKERNELS_VERBOSE) std::cout << "\tVecSize:" << suggested_vector_size << " Setting it to 4" << std::endl; suggested_vector_size = 4; } @@ -960,29 +965,24 @@ void KokkosSPGEMM nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; - } - - -#if defined( KOKKOS_ENABLE_CUDA ) - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + if (required_size + num_chunks > free_byte){ + num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; + } + { + nnz_lno_t min_chunk_size = 1; + while (min_chunk_size * 2 < num_chunks) { + min_chunk_size *= 2; + } + num_chunks = min_chunk_size; } - num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << @@ -1032,9 +1032,7 @@ void KokkosSPGEMM timer1.reset(); - //nnz_lno_t runcuda = atoi(getenv("runcuda")); - - if (/*runcuda ||*/ MyEnumExecSpace == KokkosKernels::Impl::Exec_CUDA) { + if (exec_gpu) { Kokkos::parallel_for( gpu_team_policy_t(m / suggested_team_size + 1 , suggested_team_size, suggested_vector_size), sc); } else { diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index a32d6689b9..2e12457822 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -219,6 +219,10 @@ namespace KokkosSparse{ #if defined( KOKKOS_ENABLE_CUDA ) case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined( KOKKOS_ENABLE_HIP ) + case KokkosKernels::Impl::Exec_HIP: + return row_index; #endif } } @@ -1181,6 +1185,8 @@ namespace KokkosSparse{ dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) { + using pool_memory_space = KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t>; + constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tSPARSE ACC MODE" << std::endl; } @@ -1238,7 +1244,7 @@ namespace KokkosSparse{ // Choose the SpGEMM algorithm and corresponding parameters if (this->spgemm_algorithm == SPGEMM_KK || this->spgemm_algorithm == SPGEMM_KK_LP){ - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { size_type average_row_nnz = overall_nnz / this->a_row_cnt; size_t average_row_flops = original_overall_flops / this->a_row_cnt; @@ -1310,7 +1316,7 @@ namespace KokkosSparse{ } } } - // If CUDA is not enabled, we decide whether we want to use a sparse or a dense acumulator + // If non-GPU, we decide whether we want to use a sparse or a dense acumulator else { bool run_dense = false; @@ -1364,7 +1370,7 @@ namespace KokkosSparse{ // Compute the memory pool size - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ tmp_max_nnz = 1; } @@ -1397,11 +1403,9 @@ namespace KokkosSparse{ } int num_chunks = concurrency / suggested_vector_size; -#if defined( KOKKOS_ENABLE_CUDA ) - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA) { - size_t free_byte ; - size_t total_byte ; - cudaMemGetInfo( &free_byte, &total_byte ) ; + if (exec_gpu) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); if (KOKKOSKERNELS_VERBOSE) std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; @@ -1414,7 +1418,6 @@ namespace KokkosSparse{ } num_chunks = min_chunk_size; } -#endif if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\t max_nnz: " << max_nnz @@ -1428,11 +1431,10 @@ namespace KokkosSparse{ // Allocate the memory pool KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; Kokkos::Impl::Timer timer; pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); MyExecSpace().fence(); @@ -1470,7 +1472,7 @@ namespace KokkosSparse{ } timer.reset(); - 
if (lcl_my_exec_space == KokkosKernels::Impl::Exec_CUDA){ + if (exec_gpu) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM){ if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_jacobi_sparseacc SPGEMM_KK_MEMORY_SPREADTEAM: Insufficient shmem available for key for hash map accumulator - Terminating" << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index b14f781320..3389577497 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -48,6 +48,7 @@ #include "KokkosKernels_Controls.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" #include "KokkosBlas1_scal.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv_impl_omp.hpp" @@ -113,37 +114,30 @@ struct SPMV_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) + * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = threadWork + loop; if (iRow >= m_A.numRows ()) { return; } const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; - -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? ATV::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -234,11 +228,9 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && nnz > 5000000 ) { rows_per_thread = 256; @@ -247,14 +239,12 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 256/vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -469,12 +459,14 @@ struct SPMV_MV_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) + * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
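Both launch-parameter heuristics above now branch on kk_is_gpu_exec_space<execution_space>() instead of comparing the execution space against Kokkos::Cuda, so HIP picks up the GPU tuning automatically. A plausible shape for that trait, sketched under the assumption that CUDA and HIP are the only device backends of interest (the actual definition is in KokkosKernels_ExecSpaceUtils.hpp):

    #include <type_traits>
    #include <Kokkos_Core.hpp>

    namespace KokkosKernels { namespace Impl {
    // Sketch: compile-time "is this execution space a GPU backend?" query.
    template <typename ExecSpace>
    constexpr bool kk_is_gpu_exec_space() {
      return false
    #ifdef KOKKOS_ENABLE_CUDA
        || std::is_same<ExecSpace, Kokkos::Cuda>::value
    #endif
    #ifdef KOKKOS_ENABLE_HIP
        || std::is_same<ExecSpace, Kokkos::Experimental::HIP>::value
    #endif
        ;
    }
    }} // namespace KokkosKernels::Impl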
- const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type iRow = threadWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -482,15 +474,8 @@ struct SPMV_MV_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -514,8 +499,8 @@ struct SPMV_MV_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; @@ -527,7 +512,7 @@ template struct SPMV_MV_LayoutLeft_Functor { typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::non_const_value_type A_value_type; typedef typename YVector::non_const_value_type y_value_type; typedef typename Kokkos::TeamPolicy team_policy; @@ -542,21 +527,23 @@ struct SPMV_MV_LayoutLeft_Functor { //! The number of columns in the input and output MultiVectors. ordinal_type n; ordinal_type rows_per_thread; + int vector_length; SPMV_MV_LayoutLeft_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, const YVector& m_y_, - const ordinal_type rows_per_thread_) : + const ordinal_type rows_per_thread_, + int vector_length_) : alpha (alpha_), m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) + rows_per_thread (rows_per_thread_), vector_length(vector_length_) {} template KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& /* dev */, const ordinal_type& iRow, const ordinal_type& kk) const + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const { y_value_type sum[UNROLL]; @@ -586,133 +573,80 @@ struct SPMV_MV_LayoutLeft_Functor { #ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT #pragma loop count (15) #endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); - #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { sum[k] += val * m_x(ind, kk + k); } - } + }); if (doalpha == -1) { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane } } else { for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = sumt; + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
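That comment is the crux of why the shfl_down ladder could be deleted: each vector lane holds its own partial in sum[ii], and a parallel_reduce over a ThreadVectorRange of length vector_length hands every lane exactly one iteration in which to contribute its partial, after which Kokkos broadcasts the total back to all lanes. The same idea in isolation, as a hypothetical fragment inside a team functor (dev is the team member, vlen matches the TeamPolicy's vector length):

    double lane_partial = 1.0;  // stand-in for this lane's piece of the sum
    double total;
    // The loop index is deliberately unused: the range length only serves
    // to give each vector lane one iteration.
    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vlen),
      [&](int, double& lsum) { lsum += lane_partial; },
      total);
    // total now equals the sum over all lanes, identically on every lane,
    // so this behaves as an all-reduce (unlike the shfl_down ladder, which
    // leaves the complete sum only on lane 0).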
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) + sum[ii] = sumt; + else + sum[ii] = sumt * alpha; } } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } - - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } - } + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); } } KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& /* dev */, const ordinal_type& iRow) const + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - const auto row = m_A.rowConst (iRow); // The correct type of iEntry is ordinal_type, the type of the @@ -720,48 +654,17 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = conjugate ? 
Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + lsum += val * m_x(row.colidx(iEntry),0); + }); + Kokkos::single(Kokkos::PerThread(dev), + [&]() { if (doalpha == -1) { sum = -sum; @@ -778,7 +681,7 @@ struct SPMV_MV_LayoutLeft_Functor { } else { m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; } - } + }); } @@ -800,99 +703,17 @@ struct SPMV_MV_LayoutLeft_Functor { // needs to have the same type as n. ordinal_type kk = 0; -#ifdef KOKKOS_FAST_COMPILE +//#ifdef KOKKOS_FAST_COMPILE for (; kk + 4 <= n; kk += 4) { strip_mine<4>(dev, iRow, kk); } for( ; kk < n; ++kk) { strip_mine<1>(dev, iRow, kk); } -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - } + //BMK: HERE } - }; + } +}; template OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -957,7 +778,7 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? 
For example, if the number of threads is 1, @@ -1115,7 +936,91 @@ spmv_alpha_mv (const char mode[], } } -} -} +}} //namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ + /* +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; +# endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } +#endif // KOKKOS_FAST_COMPILE + */ diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index a9c62806fd..3575f87dca 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -46,6 +46,7 @@ #define KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ #include "Kokkos_InnerProductSpaceTraits.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -91,12 +92,13 @@ struct SPMV_Struct_Transpose_Functor { KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { // iRow represents a row of the matrix, so its correct type is // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } @@ -104,15 +106,8 @@ struct SPMV_Struct_Transpose_Functor { const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row_length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const value_type val = conjugate ? 
ATV::conj (row.value(iEntry)) : @@ -120,8 +115,8 @@ struct SPMV_Struct_Transpose_Functor { const ordinal_type ind = row.colidx(iEntry); Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); - } - } + }); + }); } }; @@ -302,7 +297,7 @@ struct SPMV_Struct_Functor { }); dev.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team), [&] (const ordinal_type& loop) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, 0, rows_per_team),[&] (const ordinal_type& loop) { const ordinal_type interiorIdx = static_cast ( dev.league_rank() ) * rows_per_team + loop; if(interiorIdx >= numInterior) { return; } @@ -665,11 +660,9 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ // Determine rows per thread if(rows_per_thread < 1) { - #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) rows_per_thread = 1; else - #endif { if(nnz_per_row < 20 && numInterior*nnz_per_row > 5000000 ) { rows_per_thread = 256; @@ -678,14 +671,12 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ } } - #ifdef KOKKOS_ENABLE_CUDA if(team_size < 1) { - if(std::is_same::value) + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { team_size = 128 / vector_length; } else { team_size = 1; } } - #endif rows_per_team = rows_per_thread * team_size; @@ -903,27 +894,19 @@ struct SPMV_MV_Struct_Transpose_Functor { operator() (const team_member& dev) const { // This should be a thread loop as soon as we can use C++11 - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow represents a row of the matrix, so its correct type is - // ordinal_type. - const ordinal_type iRow = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread + loop; + const ordinal_type teamWorkStart = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) * rows_per_thread; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + [&](ordinal_type loop) + { + const ordinal_type iRow = teamWorkStart + loop; if (iRow >= m_A.numRows ()) { return; } - const auto row = m_A.rowConst (iRow); const ordinal_type row_length = row.length; -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < static_cast (row_length); - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row_length; - iEntry ++) -#endif + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row_length), + [&](ordinal_type iEntry) { const A_value_type val = conjugate ? Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : @@ -947,430 +930,251 @@ struct SPMV_MV_Struct_Transpose_Functor { static_cast (val * m_x(iRow, k))); } } - } - } + }); + }); } }; - template - struct SPMV_MV_Struct_LayoutLeft_Functor { - typedef typename AMatrix::execution_space execution_space; - typedef typename AMatrix::non_const_ordinal_type ordinal_type; - typedef typename AMatrix::non_const_value_type A_value_type; - typedef typename YVector::non_const_value_type y_value_type; - typedef typename Kokkos::TeamPolicy team_policy; - typedef typename team_policy::member_type team_member; - typedef typename YVector::non_const_value_type coefficient_type; - - const coefficient_type alpha; - AMatrix m_A; - XVector m_x; - const coefficient_type beta; - YVector m_y; - //! The number of columns in the input and output MultiVectors. 
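The transpose functors above all follow one scatter pattern: a TeamThreadRange over this thread's rows, a ThreadVectorRange over each row's entries, and an atomic add into y because different rows write the same output locations. Note that the early return inside the row lambda only skips that one row, matching the intent of the old bounds check. A condensed sketch of the pattern, with hypothetical view names A, x, y:

    Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread),
      [&](ordinal_type loop) {
        const ordinal_type iRow = teamWorkStart + loop;
        if (iRow >= A.numRows()) return;  // acts like 'continue' here
        const auto row = A.rowConst(iRow);
        Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length),
          [&](ordinal_type iEntry) {
            Kokkos::atomic_add(&y(row.colidx(iEntry)),
                               alpha * row.value(iEntry) * x(iRow));
          });
      });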
- ordinal_type n; - ordinal_type rows_per_thread; - - SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, - const AMatrix& m_A_, - const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), - m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) - {} - - template - KOKKOS_INLINE_FUNCTION void - strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const - { - y_value_type sum[UNROLL]; - -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero (); - } +template +struct SPMV_MV_Struct_LayoutLeft_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + typedef typename AMatrix::non_const_value_type A_value_type; + typedef typename YVector::non_const_value_type y_value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef typename YVector::non_const_value_type coefficient_type; - const auto row = m_A.rowConst (iRow); + const coefficient_type alpha; + AMatrix m_A; + XVector m_x; + const coefficient_type beta; + YVector m_y; + //! The number of columns in the input and output MultiVectors. + ordinal_type n; + ordinal_type rows_per_thread; + int vector_length; + + SPMV_MV_Struct_LayoutLeft_Functor (const coefficient_type& alpha_, + const AMatrix& m_A_, + const XVector& m_x_, + const coefficient_type& beta_, + const YVector& m_y_, + const ordinal_type rows_per_thread_, + int vector_length_) : + alpha (alpha_), + m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), + rows_per_thread (rows_per_thread_), vector_length(vector_length_) + {} - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. + template + KOKKOS_INLINE_FUNCTION void + strip_mine (const team_member& dev, const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? - Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - const ordinal_type ind = row.colidx(iEntry); + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry) + { + const A_value_type val = conjugate ? 
+ Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] += val * m_x(ind, kk + k); - } - } - - if (doalpha == -1) { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - sum[ii] = -sumt; - } + for (int k = 0; k < UNROLL; ++k) { + sum[k] += val * m_x(ind, kk + k); + } + }); + + if (doalpha == -1) { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type , y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. + lsum -= sum[ii]; + }, sumt); + sum[ii] = sumt; + //that was an all-reduce, so sum[ii] is the same on every vector lane } - else { - for (int ii=0; ii < UNROLL; ++ii) { - y_value_type sumt = sum[ii]; -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sumt += Kokkos::shfl_down(sumt, 1,blockDim.x); - if (blockDim.x > 2) - sumt += Kokkos::shfl_down(sumt, 2,blockDim.x); - if (blockDim.x > 4) - sumt += Kokkos::shfl_down(sumt, 4,blockDim.x); - if (blockDim.x > 8) - sumt += Kokkos::shfl_down(sumt, 8,blockDim.x); - if (blockDim.x > 16) - sumt += Kokkos::shfl_down(sumt, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) + } + else { + for (int ii=0; ii < UNROLL; ++ii) { + y_value_type sumt; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, vector_length), + [&](ordinal_type, y_value_type& lsum) + { + //in this context, sum[ii] is a partial sum ii on one of the vector lanes. 
+ lsum += sum[ii]; + }, sumt); + if(doalpha == 1) sum[ii] = sumt; - } + else + sum[ii] = sumt * alpha; } - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha * doalpha != 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - sum[k] *= alpha; - } - } - - if (dobeta == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = sum[k]; - } - } else if (dobeta == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) += sum[k]; - } - } else if (dobeta == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; - } - } else { -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; - } - } - } } - KOKKOS_INLINE_FUNCTION void - strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + Kokkos::single(Kokkos::PerThread(dev), + [&]() { - y_value_type sum = Kokkos::Details::ArithTraits::zero (); - - const auto row = m_A.rowConst (iRow); + if (dobeta == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = sum[k]; + }); + } else if (dobeta == 1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) += sum[k]; + }); + } else if (dobeta == -1) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + }); + } else { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), + [&](ordinal_type k) + { + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + }); + } + }); + } - // The correct type of iEntry is ordinal_type, the type of the - // number of columns in the (local) matrix. This is because we - // assume either that rows have no duplicate entries, or that rows - // never have enough duplicate entries to overflow ordinal_type. + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif -#ifdef __CUDA_ARCH__ - for (ordinal_type iEntry = static_cast (threadIdx.x); - iEntry < row.length; - iEntry += static_cast (blockDim.x)) -#else - for (ordinal_type iEntry = 0; - iEntry < row.length; - iEntry ++) -#endif - { - const A_value_type val = conjugate ? 
- Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : - row.value(iEntry); - sum += val * m_x(row.colidx(iEntry),0); - } -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (blockDim.x > 1) - sum += Kokkos::shfl_down(sum, 1,blockDim.x); - if (blockDim.x > 2) - sum += Kokkos::shfl_down(sum, 2,blockDim.x); - if (blockDim.x > 4) - sum += Kokkos::shfl_down(sum, 4,blockDim.x); - if (blockDim.x > 8) - sum += Kokkos::shfl_down(sum, 8,blockDim.x); - if (blockDim.x > 16) - sum += Kokkos::shfl_down(sum, 16,blockDim.x); -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - -#if defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - if (threadIdx.x==0) -#else - if (true) -#endif // defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) - { - if (doalpha == -1) { - sum = -sum; - } else if (doalpha * doalpha != 1) { - sum *= alpha; - } + y_value_type sum; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(dev, row.length), + [&](ordinal_type iEntry, y_value_type& lsum) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + lsum += val * m_x(row.colidx(iEntry),0); + }, sum); + + Kokkos::single(Kokkos::PerThread(dev), + [&]() + { + if (doalpha == -1) { + sum = -sum; + } else if (doalpha * doalpha != 1) { + sum *= alpha; + } - if (dobeta == 0) { - m_y(iRow, 0) = sum ; - } else if (dobeta == 1) { - m_y(iRow, 0) += sum ; - } else if (dobeta == -1) { - m_y(iRow, 0) = -m_y(iRow, 0) + sum; - } else { - m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; - } - } - } + if (dobeta == 0) { + m_y(iRow, 0) = sum; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; + } + }); + } - KOKKOS_INLINE_FUNCTION void - operator() (const team_member& dev) const - { - for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { + KOKKOS_INLINE_FUNCTION void + operator() (const team_member& dev) const + { + for (ordinal_type loop = 0; loop < rows_per_thread; ++loop) { - // iRow indexes over (local) rows of the matrix, so its correct - // type is ordinal_type. + // iRow indexes over (local) rows of the matrix, so its correct + // type is ordinal_type. - const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) - * rows_per_thread + loop; - if (iRow >= m_A.numRows ()) { - return; - } + const ordinal_type iRow = (dev.league_rank() * dev.team_size() + dev.team_rank()) + * rows_per_thread + loop; + if (iRow >= m_A.numRows ()) { + return; + } - // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it - // needs to have the same type as n. - ordinal_type kk = 0; + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. 
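Kokkos::single(Kokkos::PerThread(dev), ...) is the portable replacement for the old "if (threadIdx.x == 0)" guard: it runs its body on exactly one vector lane of the calling thread on CUDA and HIP, and unconditionally on host backends where the lane count is one. A minimal sketch of the final-write step it protects (after the vector-range reduce, sum is already valid on every lane):

    // Only one lane may perform the read-modify-write of y(iRow, 0).
    Kokkos::single(Kokkos::PerThread(dev), [&]() {
      y(iRow, 0) = beta * y(iRow, 0) + sum;
    });

There is also Kokkos::PerTeam(dev) for once-per-team work, which is not needed here since each row belongs to a single thread.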
+ ordinal_type kk = 0; -#ifdef KOKKOS_FAST_COMPILE - for (; kk + 4 <= n; kk += 4) { - strip_mine<4>(dev, iRow, kk); - } - for( ; kk < n; ++kk) { - strip_mine<1>(dev, iRow, kk); - } -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } +//#ifdef KOKKOS_FAST_COMPILE + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(dev, iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(dev, iRow, kk); + } + //BMK: HERE + } + } +}; - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - if(kk < n) - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - } - } - }; - - - template - static void - spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; + template + static void + spmv_alpha_beta_mv_struct_no_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; - } - if (doalpha == 0) { - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); - } - return; + if (A.numRows () <= static_cast (0)) { + return; + } + if (doalpha == 0) { + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); } - else { - typedef typename AMatrix::size_type size_type; + return; + } + else { + typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. 
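The kk loop above strip-mines the multivector's n columns so that UNROLL is a compile-time constant, letting the kernel keep UNROLL running sums per row in registers. With the fast-compile dispatch, n = 11 resolves to strip_mine<4> at kk = 0 and 4, then strip_mine<1> at kk = 8, 9, 10. Sketch of the dispatch:

    ordinal_type kk = 0;
    for (; kk + 4 <= n; kk += 4)
      strip_mine<4>(dev, iRow, kk);  // register-blocked over 4 columns
    for (; kk < n; ++kk)
      strip_mine<1>(dev, iRow, kk);  // remainder columns one at a time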
+ const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1382,16 +1186,34 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. + const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; + typedef SPMV_MV_Struct_LayoutLeft_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1403,55 +1225,73 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. 
+ // team_size is a hardware resource thing so it might legitimately + // be int. + const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename AMatrix::ordinal_type ordinal_type; + template + static void + spmv_alpha_beta_mv_struct_transpose (const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename AMatrix::ordinal_type ordinal_type; - if (A.numRows () <= static_cast (0)) { - return; - } + if (A.numRows () <= static_cast (0)) { + return; + } - // We need to scale y first ("scaling" by zero just means filling - // with zeros), since the functor works by atomic-adding into y. - if (dobeta != 1) { - KokkosBlas::scal (y, beta, y); - } + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor works by atomic-adding into y. + if (dobeta != 1) { + KokkosBlas::scal (y, beta, y); + } - if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; + if (doalpha != 0) { + typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + // Assuming that no row contains duplicate entries, NNZPerRow + // cannot be more than the number of columns of the matrix. Thus, + // the appropriate type is ordinal_type. + const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + int vector_length = 1; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels - typedef SPMV_MV_Struct_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + typedef SPMV_MV_Struct_Transpose_Functor OpType; + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. 
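The block above also shows the non-deprecated way to size teams: construct a TeamPolicy with Kokkos::AUTO and ask it for a recommendation against the actual functor. A sketch, assuming execution_space, op, vector_length, rows_per_thread, size_type and nrow as in the surrounding code:

    // The league size passed to the probe policy is a placeholder; the
    // recommendation depends on the functor and the vector length.
    Kokkos::TeamPolicy<execution_space> probe(rows_per_thread, Kokkos::AUTO,
                                              vector_length);
    const int team_size =
        probe.team_size_recommended(op, Kokkos::ParallelForTag());
    const int rows_per_team = rows_per_thread * team_size;
    const size_type nteams = (nrow + rows_per_team - 1) / rows_per_team;
    Kokkos::parallel_for("KokkosSparse::spmv_struct",
        Kokkos::TeamPolicy<execution_space>(nteams, team_size, vector_length),
        op);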
@@ -1463,16 +1303,34 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. + const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta - typedef SPMV_MV_Struct_Transpose_Functor OpType; + typedef SPMV_MV_Struct_Transpose_Functor OpType; - typename AMatrix::const_ordinal_type nrow = A.numRows(); + typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); +<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1484,73 +1342,176 @@ struct SPMV_MV_Struct_Transpose_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); +======= + // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here + // instead of int? For example, if the number of threads is 1, + // then this is just the number of rows. Ditto for rows_per_team. + // team_size is a hardware resource thing so it might legitimately + // be int. 
+ const int rows_per_thread = RowsPerThread(NNZPerRow); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE + const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); +#else + const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); +#endif + const int rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); +>>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE - } } + } - template - static void - spmv_alpha_beta_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - if (mode[0] == NoTranspose[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Conjugate[0]) { - spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == Transpose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else if (mode[0] == ConjugateTranspose[0]) { - spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); - } - else { - Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); - } + template + static void + spmv_alpha_beta_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + if (mode[0] == NoTranspose[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); } - - template - void - spmv_alpha_mv_struct (const char mode[], - const typename YVector::non_const_value_type& alpha, - const AMatrix& A, - const XVector& x, - const typename YVector::non_const_value_type& beta, - const YVector& y) - { - typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; - - if (beta == KAT::zero ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else if (beta == -KAT::one ()) { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } - else { - spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); - } + else if (mode[0] == Conjugate[0]) { + spmv_alpha_beta_mv_struct_no_transpose (alpha, A, x, beta, y); } + else if (mode[0] == Transpose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else if (mode[0] == ConjugateTranspose[0]) { + spmv_alpha_beta_mv_struct_transpose (alpha, A, x, beta, y); + } + else { + Kokkos::Impl::throw_runtime_exception ("Invalid Transpose Mode for KokkosSparse::spmv()"); + } + } + template + void + spmv_alpha_mv_struct (const char mode[], + const typename YVector::non_const_value_type& alpha, + const AMatrix& A, + const XVector& x, + const typename YVector::non_const_value_type& beta, + const YVector& y) + { + typedef typename YVector::non_const_value_type coefficient_type; + typedef Kokkos::Details::ArithTraits KAT; + if (beta == KAT::zero ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else if (beta == KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } 
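This dispatch chain turns the runtime beta into the compile-time dobeta parameter of the kernels (0, 1, -1, or 2 for "general beta"), so the inner loops pay nothing to distinguish the cases; doalpha is handled the same way one level up. With the template arguments spelled out, the dispatch reads:

    if      (beta == KAT::zero()) spmv_alpha_beta_mv_struct<AMatrix, XVector, YVector, doalpha,  0>(mode, alpha, A, x, beta, y);
    else if (beta == KAT::one())  spmv_alpha_beta_mv_struct<AMatrix, XVector, YVector, doalpha,  1>(mode, alpha, A, x, beta, y);
    else if (beta == -KAT::one()) spmv_alpha_beta_mv_struct<AMatrix, XVector, YVector, doalpha, -1>(mode, alpha, A, x, beta, y);
    else                          spmv_alpha_beta_mv_struct<AMatrix, XVector, YVector, doalpha,  2>(mode, alpha, A, x, beta, y);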
+ else if (beta == -KAT::one ()) { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + else { + spmv_alpha_beta_mv_struct (mode, alpha, A, x, beta, y); + } + } } } #endif // KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_ + /* +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) + { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) + { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; + #endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE + */ diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index a9ffcd282a..271d8b2396 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2464,6 +2464,23 @@ struct ReturnRangePolicyType { } }; #endif +#ifdef KOKKOS_ENABLE_HIP +template <> +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { + return PolicyType(stream,nt,ts); + } +}; +#endif template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > void lower_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index 295b46df9b..c069c618e6 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -72,6 +72,7 @@ struct Parameters{ int use_threads; int use_openmp; int use_cuda; + int use_hip; int use_serial; int a_mem_space, b_mem_space, c_mem_space, work_mem_space; @@ -121,6 +122,7 @@ struct Parameters{ use_threads = 0; use_openmp = 0; use_cuda = 0; + use_hip = 0; use_serial = 0; a_mem_space = b_mem_space = c_mem_space = work_mem_space = 1; a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = NULL; From 27e0a29071da9fd153e07a88c6a87244f127bb2f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 9 Oct 2020 12:56:39 -0600 Subject: [PATCH 02/18] Fixed spmv for OpenMP --- src/common/KokkosKernels_SparseUtils.hpp | 2 - .../impl/KokkosSparse_gauss_seidel_impl.hpp | 1 - .../impl/KokkosSparse_spgemm_impl_def.hpp | 1 - src/sparse/impl/KokkosSparse_spmv_impl.hpp | 182 +++++++++--------- 
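The sptrsv hunk above adds a per-backend policy factory for HIP, so supporting a new device is one more specialization rather than edits at every call site. With its template argument lists spelled out, the specialization looks like this (a sketch mirroring the CUDA version shown in the same file):

    #ifdef KOKKOS_ENABLE_HIP
    template <>
    struct ReturnRangePolicyType<Kokkos::Experimental::HIP> {
      using PolicyType = Kokkos::RangePolicy<Kokkos::Experimental::HIP>;

      static inline PolicyType get_policy(int nt, int ts) {
        return PolicyType(nt, ts);
      }

      // Overload taking an execution space instance (e.g. a HIP stream).
      template <class ExecInstanceType>
      static inline PolicyType get_policy(int nt, int ts,
                                          ExecInstanceType stream) {
        return PolicyType(stream, nt, ts);
      }
    };
    #endif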
.../impl/KokkosSparse_spmv_struct_impl.hpp | 177 +++++++++-------- 5 files changed, 178 insertions(+), 185 deletions(-) diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 7628e6de31..02ab3a50b7 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1341,8 +1341,6 @@ void kk_sort_graph( out_nnz_view_t out_adj, out_scalar_view_t out_vals){ - ExecSpaceType exec = kk_get_exec_space_type(); - // If possible, sort on host and avoid a deep copy // TODO BMK: can this function be deprecated? typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index d956ed8d4d..d5c111862f 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -826,7 +826,6 @@ namespace KokkosSparse{ nnz_lno_t num_values_in_l2 = 0; nnz_lno_t num_big_rows = 0; - KokkosKernels::Impl::ExecSpaceType ex_sp = this->handle->get_handle_exec_space(); if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { //again, if it is on CPUs, we make L1 as big as we need. size_t l1mem = 1; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 4924e11b0c..8fdf276e61 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -121,7 +121,6 @@ void KokkosSPGEMM //number of rows and nnzs nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); - KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); bool compress_in_single_step = this->handle->get_spgemm_handle()->get_compression_step(); //compress in single step if it is GPU. diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 3389577497..4645a08b63 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -366,7 +366,8 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; typedef SPMV_Transpose_Functor OpType; @@ -627,7 +628,7 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), [&](ordinal_type k) { - m_y(iRow, kk + k) = sum[k]; + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; }); } else if (dobeta == -1) { Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, UNROLL), @@ -662,7 +663,7 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : row.value(iEntry); lsum += val * m_x(row.colidx(iEntry),0); - }); + }, sum); Kokkos::single(Kokkos::PerThread(dev), [&]() { @@ -703,14 +704,97 @@ struct SPMV_MV_LayoutLeft_Functor { // needs to have the same type as n. 
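The vector-length guard in this patch is the heart of the OpenMP repair: the lane all-reduce assumes the policy's vector length matches the reduction range, and on host backends the safe choice is a vector length of 1, with each thread owning whole rows. With the template arguments restored, the guarded heuristic reads (for the single-vector kernel, whose cap is 32 lanes):

    int vector_length = 1;
    if (KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>()) {
      // e.g. NNZPerRow = 40: 1 -> 2 -> 4 -> 8, then 8*2*3 = 48 > 40 stops
      // the doubling, so 8 lanes share each row on a GPU.
      while (static_cast<ordinal_type>(vector_length * 2 * 3) <= NNZPerRow &&
             vector_length < 32)
        vector_length *= 2;
    }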
ordinal_type kk = 0; -//#ifdef KOKKOS_FAST_COMPILE +#ifdef KOKKOS_FAST_COMPILE for (; kk + 4 <= n; kk += 4) { strip_mine<4>(dev, iRow, kk); } for( ; kk < n; ++kk) { strip_mine<1>(dev, iRow, kk); } - //BMK: HERE +#else +# ifdef __CUDA_ARCH__ + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(dev, iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(dev, iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a CUDA device + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(dev, iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(dev, iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(dev, iRow, kk); + break; + + case 14: + strip_mine<14>(dev, iRow, kk); + break; + + case 13: + strip_mine<13>(dev, iRow, kk); + break; + + case 12: + strip_mine<12>(dev, iRow, kk); + break; + + case 11: + strip_mine<11>(dev, iRow, kk); + break; + + case 10: + strip_mine<10>(dev, iRow, kk); + break; + + case 9: + strip_mine<9>(dev, iRow, kk); + break; + + case 8: + strip_mine<8>(dev, iRow, kk); + break; +# endif // __CUDA_ARCH__ + case 7: + strip_mine<7>(dev, iRow, kk); + break; + + case 6: + strip_mine<6>(dev, iRow, kk); + break; + + case 5: + strip_mine<5>(dev, iRow, kk); + break; + + case 4: + strip_mine<4>(dev, iRow, kk); + break; + + case 3: + strip_mine<3>(dev, iRow, kk); + break; + + case 2: + strip_mine<2>(dev, iRow, kk); + break; + + case 1: + strip_mine_1(dev, iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE } } }; @@ -749,7 +833,8 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels @@ -939,88 +1024,3 @@ spmv_alpha_mv (const char mode[], }} //namespace KokkosSparse::Impl #endif // KOKKOSSPARSE_IMPL_SPMV_DEF_HPP_ - /* -#else -# ifdef __CUDA_ARCH__ - if ((n > 8) && (n % 8 == 1)) { - strip_mine<9>(dev, iRow, kk); - kk += 9; - } - for(; kk + 8 <= n; kk += 8) - strip_mine<8>(dev, iRow, kk); - if(kk < n) { - switch(n - kk) { -# else // NOT a CUDA device - if ((n > 16) && (n % 16 == 1)) { - strip_mine<17>(dev, iRow, kk); - kk += 17; - } - - for (; kk + 16 <= n; kk += 16) { - strip_mine<16>(dev, iRow, kk); - } - - if(kk < n) { - switch(n - kk) { - case 15: - strip_mine<15>(dev, iRow, kk); - break; - - case 14: - strip_mine<14>(dev, iRow, kk); - break; - - case 13: - strip_mine<13>(dev, iRow, kk); - break; - - case 12: - strip_mine<12>(dev, iRow, kk); - break; - - case 11: - strip_mine<11>(dev, iRow, kk); - break; - - case 10: - strip_mine<10>(dev, iRow, kk); - break; - - case 9: - strip_mine<9>(dev, iRow, kk); - break; - - case 8: - strip_mine<8>(dev, iRow, kk); - break; -# endif // __CUDA_ARCH__ - case 7: - strip_mine<7>(dev, iRow, kk); - break; - - case 6: - strip_mine<6>(dev, iRow, kk); - break; - - case 5: - strip_mine<5>(dev, iRow, kk); - break; - - case 4: - strip_mine<4>(dev, iRow, kk); - break; - - case 3: - strip_mine<3>(dev, iRow, kk); - break; - - case 2: - strip_mine<2>(dev, iRow, kk); - break; - - case 1: - strip_mine_1(dev, iRow); - break; - } -#endif // KOKKOS_FAST_COMPILE - */ diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp 
b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
index 3575f87dca..f4fa9ea1cd 100644
--- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp
@@ -1118,14 +1118,99 @@ struct SPMV_MV_Struct_LayoutLeft_Functor {
     // needs to have the same type as n.
     ordinal_type kk = 0;
-//#ifdef KOKKOS_FAST_COMPILE
+#ifdef KOKKOS_FAST_COMPILE
     for (; kk + 4 <= n; kk += 4) {
       strip_mine<4>(dev, iRow, kk);
     }
     for( ; kk < n; ++kk) {
       strip_mine<1>(dev, iRow, kk);
     }
-    //BMK: HERE
+#else
+# ifdef __CUDA_ARCH__
+    if ((n > 8) && (n % 8 == 1)) {
+      strip_mine<9>(dev, iRow, kk);
+      kk += 9;
+    }
+    for(; kk + 8 <= n; kk += 8)
+      strip_mine<8>(dev, iRow, kk);
+    if(kk < n)
+    {
+      switch(n - kk) {
+# else // NOT a CUDA device
+    if ((n > 16) && (n % 16 == 1)) {
+      strip_mine<17>(dev, iRow, kk);
+      kk += 17;
+    }
+
+    for (; kk + 16 <= n; kk += 16) {
+      strip_mine<16>(dev, iRow, kk);
+    }
+
+    if(kk < n)
+    {
+      switch(n - kk) {
+      case 15:
+        strip_mine<15>(dev, iRow, kk);
+        break;
+
+      case 14:
+        strip_mine<14>(dev, iRow, kk);
+        break;
+
+      case 13:
+        strip_mine<13>(dev, iRow, kk);
+        break;
+
+      case 12:
+        strip_mine<12>(dev, iRow, kk);
+        break;
+
+      case 11:
+        strip_mine<11>(dev, iRow, kk);
+        break;
+
+      case 10:
+        strip_mine<10>(dev, iRow, kk);
+        break;
+
+      case 9:
+        strip_mine<9>(dev, iRow, kk);
+        break;
+
+      case 8:
+        strip_mine<8>(dev, iRow, kk);
+        break;
+ #endif // __CUDA_ARCH__
+      case 7:
+        strip_mine<7>(dev, iRow, kk);
+        break;
+
+      case 6:
+        strip_mine<6>(dev, iRow, kk);
+        break;
+
+      case 5:
+        strip_mine<5>(dev, iRow, kk);
+        break;
+
+      case 4:
+        strip_mine<4>(dev, iRow, kk);
+        break;
+
+      case 3:
+        strip_mine<3>(dev, iRow, kk);
+        break;
+
+      case 2:
+        strip_mine<2>(dev, iRow, kk);
+        break;
+
+      case 1:
+        strip_mine_1(dev, iRow);
+        break;
+      }
+    }
+#endif // KOKKOS_FAST_COMPILE
     }
   }
 };
@@ -1427,91 +1512,3 @@ struct SPMV_MV_Struct_LayoutLeft_Functor {
 }
 
 #endif // KOKKOSSPARSE_IMPL_SPMV_STRUCT_DEF_HPP_
-  /*
-#else
-# ifdef __CUDA_ARCH__
-    if ((n > 8) && (n % 8 == 1)) {
-      strip_mine<9>(dev, iRow, kk);
-      kk += 9;
-    }
-    for(; kk + 8 <= n; kk += 8)
-      strip_mine<8>(dev, iRow, kk);
-    if(kk < n)
-    {
-      switch(n - kk) {
-# else // NOT a CUDA device
-    if ((n > 16) && (n % 16 == 1)) {
-      strip_mine<17>(dev, iRow, kk);
-      kk += 17;
-    }
-
-    for (; kk + 16 <= n; kk += 16) {
-      strip_mine<16>(dev, iRow, kk);
-    }
-
-    if(kk < n)
-    {
-      switch(n - kk) {
-      case 15:
-        strip_mine<15>(dev, iRow, kk);
-        break;
-
-      case 14:
-        strip_mine<14>(dev, iRow, kk);
-        break;
-
-      case 13:
-        strip_mine<13>(dev, iRow, kk);
-        break;
-
-      case 12:
-        strip_mine<12>(dev, iRow, kk);
-        break;
-
-      case 11:
-        strip_mine<11>(dev, iRow, kk);
-        break;
-
-      case 10:
-        strip_mine<10>(dev, iRow, kk);
-        break;
-
-      case 9:
-        strip_mine<9>(dev, iRow, kk);
-        break;
-
-      case 8:
-        strip_mine<8>(dev, iRow, kk);
-        break;
- #endif // __CUDA_ARCH__
-      case 7:
-        strip_mine<7>(dev, iRow, kk);
-        break;
-
-      case 6:
-        strip_mine<6>(dev, iRow, kk);
-        break;
-
-      case 5:
-        strip_mine<5>(dev, iRow, kk);
-        break;
-
-      case 4:
-        strip_mine<4>(dev, iRow, kk);
-        break;
-
-      case 3:
-        strip_mine<3>(dev, iRow, kk);
-        break;
-
-      case 2:
-        strip_mine<2>(dev, iRow, kk);
-        break;
-
-      case 1:
-        strip_mine_1(dev, iRow);
-        break;
-      }
-    }
-#endif // KOKKOS_FAST_COMPILE
-  */

From f993534289950c566205e25fded0236e80c379d6 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Fri, 9 Oct 2020 13:06:41 -0600
Subject: [PATCH 03/18] Removed #pragma unroll

Used to be a normal for loop, now it's a ThreadVectorRange
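As a schematic sketch of why the pragmas go away (illustration only, not part
of the diff; the accumulation body is elided):

  // Before: a plain per-thread loop, where ivdep/unroll/loop-count pragmas
  // could guide the host compiler.
  #pragma unroll
  for (ordinal_type iEntry = 0; iEntry < row.length; iEntry++) {
    /* accumulate into lsum */
  }

  // After: a vector-parallel loop; those pragmas no longer apply to it,
  // so they are dropped.
  Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length),
    [&](ordinal_type iEntry) {
      /* accumulate into lsum */
    });
---
 src/sparse/impl/KokkosSparse_spmv_impl.hpp | 9 ---------
 1 file changed, 9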
deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 4645a08b63..1d2f737fa6 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -565,15 +565,6 @@ struct SPMV_MV_LayoutLeft_Functor { // assume either that rows have no duplicate entries, or that rows // never have enough duplicate entries to overflow ordinal_type. -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif -#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT -#pragma loop count (15) -#endif Kokkos::parallel_for(Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry) { From 5e0b1191d97e43902b12205a71b9f49958c3ef32 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 12 Oct 2020 10:18:54 -0600 Subject: [PATCH 04/18] Update for deprecated removal --- .../impl/KokkosSparse_spmv_struct_impl.hpp | 73 ------------------- 1 file changed, 73 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index f4fa9ea1cd..be563c5257 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -1259,7 +1259,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { typename AMatrix::const_ordinal_type nrow = A.numRows(); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1271,23 +1270,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1298,7 +1280,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. 
@@ -1310,24 +1291,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI - #endif // KOKKOS_FAST_COMPILE } } @@ -1376,7 +1339,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { typename AMatrix::const_ordinal_type nrow = A.numRows(); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. @@ -1388,23 +1350,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for ("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1415,7 +1360,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); -<<<<<<< c079fe8700c4ab2b00626d70c76e4aa46f828acb // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? For example, if the number of threads is 1, // then this is just the number of rows. Ditto for rows_per_team. 
@@ -1427,23 +1371,6 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); -======= - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE - const int team_size = Kokkos::TeamPolicy< typename AMatrix::execution_space >::team_size_recommended(op,vector_length); -#else - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); -#endif - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv_struct", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); ->>>>>>> WIP: adding HIP codepaths in preparation for tests/ETI #endif // KOKKOS_FAST_COMPILE } From 5315d0f9b0a478f9e799f1195709134668918025 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 13 Oct 2020 18:18:25 -0700 Subject: [PATCH 05/18] Fix SpMV transpose functors --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 52 ++++++++++------------ 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 1d2f737fa6..86b342647b 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -81,7 +81,6 @@ struct GetCoeffView,DeviceType> { template struct SPMV_Transpose_Functor { typedef typename AMatrix::execution_space execution_space; @@ -96,32 +95,26 @@ struct SPMV_Transpose_Functor { const coefficient_type alpha; AMatrix m_A; XVector m_x; - const coefficient_type beta; YVector m_y; - const ordinal_type rows_per_thread; + ordinal_type rows_per_team; SPMV_Transpose_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, - const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : - alpha (alpha_), m_A (m_A_), m_x (m_x_), - beta (beta_), m_y (m_y_), - rows_per_thread (rows_per_thread_) + const YVector& m_y_) : + alpha (alpha_), m_A (m_A_), m_x (m_x_), m_y (m_y_) {} KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread; - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), [&](ordinal_type loop) { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = threadWork + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -366,18 +359,18 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; - typedef SPMV_Transpose_Functor OpType; + typedef SPMV_Transpose_Functor OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, y); const int rows_per_thread = RowsPerThread (NNZPerRow); const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); @@ -444,30 +437,27 @@ struct SPMV_MV_Transpose_Functor { YVector m_y; const ordinal_type n; - const ordinal_type rows_per_thread; + ordinal_type rows_per_team; SPMV_MV_Transpose_Functor (const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, - const YVector& m_y_, - const ordinal_type rows_per_thread_) : + const YVector& m_y_) : alpha (alpha_), - m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)), - rows_per_thread (rows_per_thread_) + m_A (m_A_), m_x (m_x_), beta (beta_), m_y (m_y_), n (m_x_.extent(1)) {} KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - const ordinal_type threadWork = (static_cast (dev.league_rank() * dev.team_size() + dev.team_rank())) - * rows_per_thread; - Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_thread), + const ordinal_type teamWork = dev.league_rank() * rows_per_team; + Kokkos::parallel_for(Kokkos::TeamThreadRange(dev, rows_per_team), [&](ordinal_type loop) { // iRow represents a row of the matrix, so its correct type is // ordinal_type. 
- const ordinal_type iRow = threadWork + loop; + const ordinal_type iRow = teamWork + loop; if (iRow >= m_A.numRows ()) { return; } @@ -906,13 +896,15 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + //Transpose functor uses atomics which can't be vectorized on CPU + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels typedef SPMV_MV_Transpose_Functor OpType; - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y); typename AMatrix::const_ordinal_type nrow = A.numRows(); @@ -924,6 +916,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const int rows_per_thread = RowsPerThread(NNZPerRow); const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const int rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > ( nteams , team_size , vector_length ) , op ); @@ -935,7 +928,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph typename AMatrix::const_ordinal_type nrow = A.numRows(); - OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow)); + OpType op (alpha, A, x, beta, y); // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here // instead of int? 
For example, if the number of threads is 1,
@@ -945,6 +938,7 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph
   const int rows_per_thread = RowsPerThread(NNZPerRow);
   const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag());
   const int rows_per_team = rows_per_thread * team_size;
+  op.rows_per_team = rows_per_team;
   const size_type nteams = (nrow+rows_per_team-1)/rows_per_team;
   Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space >
      ( nteams , team_size , vector_length ) , op );

From da542886d32819e9b02c0e684a7f1b23c2ce0ce5 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Wed, 14 Oct 2020 10:42:52 -0700
Subject: [PATCH 06/18] Add back D1 default algorithm verbose output

---
 src/graph/KokkosGraph_Distance1ColorHandle.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp
index e85412abb6..503c6c9310 100644
--- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp
+++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp
@@ -237,9 +237,19 @@ class GraphColoringHandle
 {
     auto exec = KokkosKernels::Impl::kk_get_exec_space_type();
     if(exec == KokkosKernels::Impl::Exec_SERIAL)
+    {
       this->coloring_algorithm_type = COLORING_SERIAL;
+#ifdef VERBOSE
+      std::cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n";
+#endif
+    }
     else
+    {
       this->coloring_algorithm_type = COLORING_VBBIT;
+#ifdef VERBOSE
+      std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n";
+#endif
+    }
   }
 
   template

From b5349f110f27f96f2d4261fdeec8b6072c644ea1 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Wed, 14 Oct 2020 10:43:21 -0700
Subject: [PATCH 07/18] Fix HIP device code macros

It's __HIP_DEVICE_COMPILE__, not __CUDA_ARCH__.
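As a minimal illustration of the guard pattern this commit applies (a sketch,
not part of the diff; example_pop_count is a hypothetical name, and Kokkos
headers are assumed to be included):

  KOKKOS_FORCEINLINE_FUNCTION
  int example_pop_count(unsigned i) {
  #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
    // Device compilation pass: CUDA and HIP both provide the __popc intrinsic.
    return __popc(i);
  #else
    // Host compilation pass: portable fallback.
    int count = 0;
    for (; i; i >>= 1) count += (i & 1);
    return count;
  #endif
  }

__CUDA_ARCH__ is defined only during NVCC's device pass and
__HIP_DEVICE_COMPILE__ only during HIP's device pass, so testing either macro
selects the device code path on both backends.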
--- perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_LU_Host_Real.cpp | 2 +- .../do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp | 2 +- src/batched/KokkosBatched_Vector_SIMD.hpp | 4 ++-- src/batched/KokkosBatched_Vector_SIMD_Arith.hpp | 8 ++++---- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 8 ++++---- src/common/KokkosKernels_BitUtils.hpp | 5 ++--- src/common/KokkosKernels_SparseUtils.hpp | 1 - src/sparse/impl/KokkosSparse_spmv_impl.hpp | 6 +++--- src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 4 ++-- unit_test/batched/Test_Batched_SerialTrmm.hpp | 2 +- unit_test/batched/Test_Batched_SerialTrtri.hpp | 2 +- unit_test/blas/Test_Blas3_gemm.hpp | 2 +- unit_test/blas/Test_Blas3_trmm.hpp | 2 +- unit_test/blas/Test_Blas3_trsm.hpp | 3 ++- unit_test/blas/Test_Blas_trtri.hpp | 2 +- 19 files changed, 30 insertions(+), 31 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp index f37c2d1b6f..ac8abb18f7 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp @@ -51,7 +51,7 @@ using namespace KokkosBatched; int main (int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; const bool detail = false; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp index adff41c48b..2fffa06855 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp @@ -29,7 +29,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp index 7bb2a2907c..031909d540 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; int N[1] = { 128*128 }; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp index 8468800ee6..56ade7a446 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp @@ -27,7 +27,7 @@ void run(const int N) { int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; const int N[1] = { 128*128 }; diff 
--git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp index 7b39c624f2..7d352283c6 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp @@ -21,7 +21,7 @@ int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); -#if !defined(__CUDA_ARCH__) +#if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int N = 128*128; for (int i=1;i #include -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) #undef __KOKKOSBATCHED_ENABLE_AVX__ #else // compiler bug with AVX in some architectures @@ -129,7 +129,7 @@ namespace KokkosBatched { } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) namespace KokkosBatched { template<> diff --git a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp index 43ddbb101b..49317ca9d4 100644 --- a/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD_Arith.hpp @@ -77,7 +77,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -298,7 +298,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -568,7 +568,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) @@ -858,7 +858,7 @@ namespace KokkosBatched { return r_val; } -#if defined(__CUDA_ARCH__) && (defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)) +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(float,2) diff --git a/src/blas/impl/KokkosBlas3_gemm_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_impl.hpp index 2e50a0064c..fc5ba4dfa6 100644 --- a/src/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -64,20 +64,20 @@ namespace Impl { // On GPUs it is more important to not jump around in global memory, i.e. 
have coalesced loads
 template
 struct impl_gemm_choose_copy_layout {
-  typedef LayoutAScratch type;
+  using type = LayoutAScratch;
 };
 
 #ifdef KOKKOS_ENABLE_CUDA
 template
 struct impl_gemm_choose_copy_layout {
-  typedef LayoutA type;
+  using type = LayoutA;
 };
 #endif
 
 #ifdef KOKKOS_ENABLE_HIP
 template
 struct impl_gemm_choose_copy_layout {
-  typedef LayoutA type;
+  using type = LayoutA;
 };
 #endif
 
@@ -399,7 +399,7 @@ KOKKOS_INLINE_FUNCTION void impl_team_gemm_block(const TeamHandle& team, const ViewTypeC& C, const ViewTypeA& A, const ViewTypeB& B) {
   typedef typename ViewTypeC::non_const_value_type ScalarC;
   // GNU COMPILER BUG WORKAROUND
-#if defined(KOKKOS_COMPILER_GNU) || !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
   int blockA0 = A.extent_int(0);
   int blockA1 = A.extent_int(1);
   int blockB1 = B.extent_int(1);

diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp
index 28b2a01389..4d09fb964e 100644
--- a/src/common/KokkosKernels_BitUtils.hpp
+++ b/src/common/KokkosKernels_BitUtils.hpp
@@ -51,8 +51,7 @@ namespace KokkosKernels{
 namespace Impl{
 
 // POP COUNT function returns the number of set bits
-// Note BMK: HIP also defines __CUDA_ARCH__, and provides the same intrinsics.
-#if defined( __CUDA_ARCH__ )
+#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__)
 KOKKOS_FORCEINLINE_FUNCTION
 int pop_count( unsigned i ){
   return __popc(i);
@@ -182,7 +181,7 @@ int pop_count( long long i ){
 
 // least_set_bit function returns the position of right most set bit
-#if defined( __CUDA_ARCH__ )
+#if defined( __CUDA_ARCH__ ) || defined(__HIP_DEVICE_COMPILE__)
 KOKKOS_FORCEINLINE_FUNCTION
 int least_set_bit( unsigned i ){
   return __ffs(i);

diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp
index 02ab3a50b7..6979f15847 100644
--- a/src/common/KokkosKernels_SparseUtils.hpp
+++ b/src/common/KokkosKernels_SparseUtils.hpp
@@ -1341,7 +1341,6 @@ void kk_sort_graph(
     out_nnz_view_t out_adj,
     out_scalar_view_t out_vals){
-
   // If possible, sort on host and avoid a deep copy
   // TODO BMK: can this function be deprecated?
typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); Kokkos::deep_copy (hr, in_xadj); diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 86b342647b..1c011e42d9 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -693,7 +693,7 @@ struct SPMV_MV_LayoutLeft_Functor { strip_mine<1>(dev, iRow, kk); } #else -# ifdef __CUDA_ARCH__ +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) if ((n > 8) && (n % 8 == 1)) { strip_mine<9>(dev, iRow, kk); kk += 9; @@ -702,7 +702,7 @@ struct SPMV_MV_LayoutLeft_Functor { strip_mine<8>(dev, iRow, kk); if(kk < n) { switch(n - kk) { -# else // NOT a CUDA device +# else // NOT a GPU if ((n > 16) && (n % 16 == 1)) { strip_mine<17>(dev, iRow, kk); kk += 17; @@ -745,7 +745,7 @@ struct SPMV_MV_LayoutLeft_Functor { case 8: strip_mine<8>(dev, iRow, kk); break; -# endif // __CUDA_ARCH__ +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ case 7: strip_mine<7>(dev, iRow, kk); break; diff --git a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index be563c5257..3179a0cc31 100644 --- a/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -1126,7 +1126,7 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { strip_mine<1>(dev, iRow, kk); } #else -# ifdef __CUDA_ARCH__ +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) if ((n > 8) && (n % 8 == 1)) { strip_mine<9>(dev, iRow, kk); kk += 9; @@ -1180,7 +1180,7 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { case 8: strip_mine<8>(dev, iRow, kk); break; - #endif // __CUDA_ARCH__ + #endif // __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ case 7: strip_mine<7>(dev, iRow, kk); break; diff --git a/unit_test/batched/Test_Batched_SerialTrmm.hpp b/unit_test/batched/Test_Batched_SerialTrmm.hpp index 8f8fd48758..3301f3cd42 100644 --- a/unit_test/batched/Test_Batched_SerialTrmm.hpp +++ b/unit_test/batched/Test_Batched_SerialTrmm.hpp @@ -54,7 +54,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/batched/Test_Batched_SerialTrtri.hpp b/unit_test/batched/Test_Batched_SerialTrtri.hpp index c50e26ae35..f4f74d6b7c 100644 --- a/unit_test/batched/Test_Batched_SerialTrtri.hpp +++ b/unit_test/batched/Test_Batched_SerialTrtri.hpp @@ -56,7 +56,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) int i = team.league_rank(); #else const int i = team.league_rank(); diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp index 55c71231f6..451b7fedac 100644 --- a/unit_test/blas/Test_Blas3_gemm.hpp +++ b/unit_test/blas/Test_Blas3_gemm.hpp @@ -25,7 +25,7 @@ namespace Test { KOKKOS_INLINE_FUNCTION void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { // GNU COMPILER BUG WORKAROUND -#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) +#if 
defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     int i = team.league_rank();
 #else
     const int i = team.league_rank();

diff --git a/unit_test/blas/Test_Blas3_trmm.hpp b/unit_test/blas/Test_Blas3_trmm.hpp
index 74fd49b988..9f72bd5e63 100644
--- a/unit_test/blas/Test_Blas3_trmm.hpp
+++ b/unit_test/blas/Test_Blas3_trmm.hpp
@@ -49,7 +49,7 @@ namespace Test {
     KOKKOS_INLINE_FUNCTION
     void operator() (const typename Kokkos::TeamPolicy::member_type& team) const {
       // GNU COMPILER BUG WORKAROUND
-#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     int i = team.league_rank();
 #else
     const int i = team.league_rank();

diff --git a/unit_test/blas/Test_Blas3_trsm.hpp b/unit_test/blas/Test_Blas3_trsm.hpp
index e6e98723c2..8fec44b637 100644
--- a/unit_test/blas/Test_Blas3_trsm.hpp
+++ b/unit_test/blas/Test_Blas3_trsm.hpp
@@ -49,7 +49,8 @@ namespace Test {
     KOKKOS_INLINE_FUNCTION
     void operator() (const typename Kokkos::TeamPolicy::member_type& team) const {
       // GNU COMPILER BUG WORKAROUND
-#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+
     int i = team.league_rank();
 #else
     const int i = team.league_rank();

diff --git a/unit_test/blas/Test_Blas_trtri.hpp b/unit_test/blas/Test_Blas_trtri.hpp
index f939b87b31..bcc6b842c8 100644
--- a/unit_test/blas/Test_Blas_trtri.hpp
+++ b/unit_test/blas/Test_Blas_trtri.hpp
@@ -49,7 +49,7 @@ namespace Test {
     KOKKOS_INLINE_FUNCTION
     void operator() (const typename Kokkos::TeamPolicy::member_type& team) const {
       // GNU COMPILER BUG WORKAROUND
-#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__)
+#if defined(KOKKOS_COMPILER_GNU) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
     int i = team.league_rank();
 #else
     const int i = team.league_rank();

From 9a9ec3443b8be533f35d9baf4ca865f1ae9e7741 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Wed, 14 Oct 2020 10:52:53 -0700
Subject: [PATCH 08/18] Restore d2 coloring verbose about default algo

---
 src/graph/KokkosGraph_Distance2ColorHandle.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp
index 4c392051fb..39d66b744f 100644
--- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp
+++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp
@@ -206,9 +206,19 @@ class GraphColorDistance2Handle
     void choose_default_algorithm()
     {
         if(KokkosKernels::Impl::kk_get_exec_space_type() == KokkosKernels::Impl::Exec_SERIAL)
+        {
             this->coloring_algorithm_type = COLORING_D2_SERIAL;
+#ifdef VERBOSE
+            std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n";
+#endif
+        }
         else
+        {
             this->coloring_algorithm_type = COLORING_D2_NB_BIT;
+#ifdef VERBOSE
+            std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n";
+#endif
+        }
     }

From 2c3e3a46750f21d7d1b268e47645da9089b2ca54 Mon Sep 17 00:00:00 2001
From: Brian Kelley
Date: Wed, 14 Oct 2020 11:01:38 -0700
Subject: [PATCH 09/18] Fix indent

---
 src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
index 29dbb5c477..ec0c2034a2 100644
--- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp
+++
b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1508,7 +1508,7 @@ void KokkosSPGEMM current_spgemm_algorithm = SPGEMM_KK_MEMORY; } maxNumRoughNonzeros = KOKKOSKERNELS_MACRO_MIN(this->b_col_cnt, maxNumRoughNonzeros); - int shmem_size_to_use = shmem_size; + int shmem_size_to_use = shmem_size; typedef KokkosKernels::Impl::UniformMemoryPool< MyTempMemorySpace, nnz_lno_t> pool_memory_space; From d2448f2943052642a03a50c95588e1728283dd40 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 12:48:25 -0700 Subject: [PATCH 10/18] Factor out pool #chunks computation for SpGEMM (same code used in 7 places) --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 27 ++++++++++++ .../KokkosSparse_spgemm_impl_compression.hpp | 21 +-------- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 20 +-------- .../KokkosSparse_spgemm_impl_symbolic.hpp | 43 ++----------------- .../KokkosSparse_spgemm_impl_triangle.hpp | 21 ++------- ...se_spgemm_impl_triangle_no_compression.hpp | 19 ++------ ...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 18 +------- 7 files changed, 43 insertions(+), 126 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index a8a539ef10..52ae067801 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -789,6 +789,33 @@ class KokkosSPGEMM{ }; +//Utility to compute the number of pool chunks for L2 hashmap accumulators. +//Uses free memory query for accelerators/GPUs but assumes infinite available host memory. +// +//chunk_bytes: bytes in each chunk +//ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) +template +size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) +{ + if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + size_t num_chunks = ideal_num_chunks; + //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + //then take the largest power of 2 smaller than that + nnz_lno_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; +} } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index c881c98ed4..6936a49f15 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -860,26 +860,9 @@ bool KokkosSPGEMM sszm_compressMatrix.pow2_hash_size = min_hash_size; sszm_compressMatrix.pow2_hash_func = min_hash_size - 1; - size_t num_chunks = concurrency / suggested_vector_size; + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << 
required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks*sizeof(int) > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - size_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\tPOOL chunksize:" << chunksize << " num_chunks:" diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 38fce91b1b..e81b019e15 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -1421,25 +1421,9 @@ void chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index ec0c2034a2..4eb13d9b5e 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1642,30 +1642,13 @@ void KokkosSPGEMM } //initizalize value for the mem pool - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; @@ -1970,31 +1953,13 @@ void KokkosSPGEMM std::cout << "\tDense Acc - COLS:" << col_size << " max_row_size:" << max_row_size << std::endl; } } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; - KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << " chunksize:" << chunksize << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index 27c0f4c7d9..6624343b52 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1416,29 +1416,14 @@ void KokkosSPGEMM } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - if(exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); + if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. 
<< " num_chunks:" << num_chunks << diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index ae913f864a..adc75d6eb2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -963,26 +963,13 @@ void KokkosSPGEMM pool_init_val = 0; } - nnz_lno_t num_chunks = concurrency / suggested_vector_size; KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * accumulator_chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / accumulator_chunksize; - } - { - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 < num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); + if (KOKKOSKERNELS_VERBOSE){ std::cout << "\tPool Size (MB):" << (num_chunks * accumulator_chunksize * sizeof(nnz_lno_t)) / 1024. / 1024. << " num_chunks:" << num_chunks << diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 2e12457822..2140b8dc56 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -1401,23 +1401,9 @@ namespace KokkosSparse{ chunksize += min_hash_size ; //this is for the hash begins chunksize += max_nnz; //this is for hash nexts } - int num_chunks = concurrency / suggested_vector_size; - if (exec_gpu) { - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = size_t (num_chunks) * chunksize * sizeof(nnz_lno_t); - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - if (required_size + num_chunks > free_byte){ - num_chunks = ((((free_byte - num_chunks)* 0.5) /8 ) * 8) / sizeof(nnz_lno_t) / chunksize; - } - nnz_lno_t min_chunk_size = 1; - while (min_chunk_size * 2 <= num_chunks) { - min_chunk_size *= 2; - } - num_chunks = min_chunk_size; - } + nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ std::cout << "\t\t max_nnz: " << max_nnz From f4cacdc19ccf98c89f22e5d65246cf8782189433 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 13:10:09 -0700 Subject: [PATCH 11/18] Made compute_num_pool_chunks a member of SpGEMM --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 55 +++++++++---------- .../KokkosSparse_spgemm_impl_compression.hpp | 2 +- .../impl/KokkosSparse_spgemm_impl_kkmem.hpp | 2 +- .../KokkosSparse_spgemm_impl_symbolic.hpp | 4 +- .../KokkosSparse_spgemm_impl_triangle.hpp | 2 +- ...se_spgemm_impl_triangle_no_compression.hpp | 2 +- 
...kosSparse_spgemm_jacobi_sparseacc_impl.hpp | 2 +- 7 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index 52ae067801..19e576eb9d 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -787,35 +787,34 @@ class KokkosSPGEMM{ typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType my_exec_space); -}; - -//Utility to compute the number of pool chunks for L2 hashmap accumulators. -//Uses free memory query for accelerators/GPUs but assumes infinite available host memory. -// -//chunk_bytes: bytes in each chunk -//ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) -template -size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) -{ - if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) - return ideal_num_chunks; - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - size_t required_size = ideal_num_chunks * chunk_bytes; - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; - size_t num_chunks = ideal_num_chunks; - //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down - if (required_size > free_byte / 2) { - num_chunks = (free_byte / 2) / chunk_bytes; + //Utility to compute the number of pool chunks for L2 hashmap accumulators. + //Uses free memory query for accelerators/GPUs but assumes infinite available host memory. + // + //chunk_bytes: bytes in each chunk + //ideal_num_chunks: number of chunks that would give each thread/team its own chunk (no contention) + template + size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) + { + if(!KokkosKernels::Impl::kk_is_gpu_exec_space()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size << " free_byte:" << free_byte << " total_byte:" << total_byte << std::endl; + size_t num_chunks = ideal_num_chunks; + //If there is not enough memory to safely allocate ideal_num_chunks, use half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + //then take the largest power of 2 smaller than that + nnz_lno_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; } - //then take the largest power of 2 smaller than that - nnz_lno_t po2_num_chunks = 1; - while (po2_num_chunks * 2 < num_chunks) { - po2_num_chunks *= 2; - } - return po2_num_chunks; -} +}; } } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index 6936a49f15..35f00201a2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -860,7 +860,7 @@ bool KokkosSPGEMM sszm_compressMatrix.pow2_hash_size = min_hash_size; sszm_compressMatrix.pow2_hash_func = min_hash_size - 1; - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks 
(chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index e81b019e15..a5fc298e2c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -1422,7 +1422,7 @@ void chunksize += max_nnz; //this is for hash nexts } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index 4eb13d9b5e..f6f4e8e3a8 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1647,7 +1647,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ @@ -1958,7 +1958,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index 6624343b52..c06d4c4cb2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1421,7 +1421,7 @@ void KokkosSPGEMM my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index adc75d6eb2..6a9b67c0b2 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -967,7 +967,7 @@ void KokkosSPGEMM if (exec_gpu) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (accumulator_chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if (KOKKOSKERNELS_VERBOSE){ diff --git a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 2140b8dc56..d4c2c98a6f 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -1402,7 +1402,7 @@ namespace KokkosSparse{ chunksize += max_nnz; //this is for hash nexts } - nnz_lno_t num_chunks = KokkosSparse::Impl::compute_num_pool_chunks + nnz_lno_t num_chunks = this->template compute_num_pool_chunks (chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); if 
(KOKKOSKERNELS_VERBOSE){ From 7aef9b13a1b38e19392074cbe18381802dfad115 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 14 Oct 2020 13:13:13 -0700 Subject: [PATCH 12/18] Fix signed vs. unsigned --- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index 19e576eb9d..06a3153ad9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -808,7 +808,7 @@ class KokkosSPGEMM{ num_chunks = (free_byte / 2) / chunk_bytes; } //then take the largest power of 2 smaller than that - nnz_lno_t po2_num_chunks = 1; + size_t po2_num_chunks = 1; while (po2_num_chunks * 2 < num_chunks) { po2_num_chunks *= 2; } From fd94bd47e3fa964a4054c510564fe1a3d83e2472 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 21:24:56 -0600 Subject: [PATCH 13/18] WIP: improving performance of spmv for openmp --- perf_test/sparse/CMakeLists.txt | 5 + perf_test/sparse/KokkosSparse_kk_spmv.cpp | 186 ++++++++ src/common/KokkosKernels_ExecSpaceUtils.hpp | 20 +- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 427 ++++++++++++++---- .../impl/KokkosSparse_spmv_impl_omp.hpp | 1 - unit_test/sparse/Test_Sparse_spmv.hpp | 31 ++ 6 files changed, 576 insertions(+), 94 deletions(-) create mode 100644 perf_test/sparse/KokkosSparse_kk_spmv.cpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index da22993cda..f0662e4a08 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -43,6 +43,11 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_spmv.cpp ) +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_kk_spmv + SOURCES KokkosSparse_kk_spmv.cpp + ) + IF(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) KOKKOSKERNELS_ADD_EXECUTABLE( sparse_spmv_merge diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp new file mode 100644 index 0000000000..07c29e3735 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -0,0 +1,186 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "KokkosKernels_default_types.hpp" + +typedef default_scalar Scalar; +typedef default_lno_t Ordinal; +typedef default_size_type Offset; + +template +void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, int num_vecs, char mode, Scalar beta) { + typedef KokkosSparse::CrsMatrix matrix_type; + typedef typename Kokkos::View mv_type; + typedef typename mv_type::HostMirror h_mv_type; + + srand(17312837); + matrix_type A; + if(filename) + A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + else + { + Offset nnz = 10 * numRows; + //note: the help text says the bandwidth is fixed at 0.01 * numRows + A = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows, numCols, nnz, 0, 0.01 * numRows); + } + numRows = A.numRows(); + numCols = A.numCols(); + Offset nnz = A.nnz(); + mv_type x("X", numCols, num_vecs); + mv_type y("Y", numRows, num_vecs); + h_mv_type h_x = Kokkos::create_mirror_view(x); + h_mv_type h_y = Kokkos::create_mirror_view(y); + h_mv_type h_y_compare = Kokkos::create_mirror(y); + + for(int v = 0; v < num_vecs; v++) + { + for(int i=0; i::value) + layout = 'L'; + else + layout = 'R'; + int loop = 100; + int num_vecs = 1; + Scalar beta = 0.0; + + if(argc == 1) { + print_help(); + return 0; + } + + for(int i=0;i(size,size,filename,loop,num_vecs,mode,beta); + else + run_spmv(size,size,filename,loop,num_vecs,mode,beta); + + Kokkos::finalize(); +} + diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index 22930c82e1..59bcf487fb 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -55,7 +55,7 @@ namespace Impl{ enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; template -constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ +KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; #if defined( KOKKOS_ENABLE_SERIAL ) if (std::is_same< Kokkos::Serial , ExecutionSpace >::value){ @@ -98,11 +98,23 @@ constexpr KOKKOS_INLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ template constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - auto exec = kk_get_exec_space_type(); - //TODO BMK: Add OpenMPTarget and any other future GPU exec spaces - return exec == Exec_CUDA || exec == Exec_HIP; + return false; } +#ifdef KOKKOS_ENABLE_CUDA +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + +#ifdef KOKKOS_ENABLE_HIP +template <> +constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { + return true; +} +#endif + //Host function to determine free and total device memory. 
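The ExecSpaceUtils hunk above trades the runtime enum query for full template specializations, so kk_is_gpu_exec_space<ExecSpace>() folds to a compile-time constant that both host and device compilation can branch on. A self-contained sketch of that specialization pattern, using toy space types rather than the Kokkos ones:

// Primary template: an unrecognized execution space is assumed host.
struct ToyHostSpace {};
struct ToyGpuSpace {};

template <typename ExecSpace>
constexpr bool is_gpu_exec_space() { return false; }

// A full specialization flips the answer for each GPU space.
template <>
constexpr bool is_gpu_exec_space<ToyGpuSpace>() { return true; }

// The value is usable in constant expressions, so dispatch costs nothing:
static_assert(!is_gpu_exec_space<ToyHostSpace>(), "host space");
static_assert(is_gpu_exec_space<ToyGpuSpace>(), "gpu space");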
//Will throw if execution space doesn't support this. template diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 1c011e42d9..558acc363a 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -171,10 +171,38 @@ struct SPMV_Functor { "YVector must be a rank 1 View."); } + KOKKOS_INLINE_FUNCTION + void operator() (const ordinal_type iRow) const + { + using y_value_type = typename YVector::non_const_value_type; + if (iRow >= m_A.numRows ()) { + return; + } + const KokkosSparse::SparseRowViewConst row = m_A.rowConst(iRow); + const ordinal_type row_length = static_cast (row.length); + y_value_type sum = 0; + + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? + ATV::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry)); + } + + sum *= alpha; + + if (dobeta == 0) { + m_y(iRow) = sum ; + } else { + m_y(iRow) = beta * m_y(iRow) + sum; + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { - typedef typename YVector::non_const_value_type y_value_type; + using y_value_type = typename YVector::non_const_value_type; Kokkos::parallel_for(Kokkos::TeamThreadRange(dev,0,rows_per_team), [&] (const ordinal_type& loop) { @@ -213,9 +241,19 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th if(nnz_per_row < 1) nnz_per_row = 1; + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + if(vector_length < 1) { vector_length = 1; - while(vector_length<32 && vector_length*6 < nnz_per_row) + while(vector_length < max_vector_length && vector_length * 6 < nnz_per_row) vector_length*=2; } @@ -280,21 +318,14 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, ((int) A.graph.row_block_offsets.extent(0) == (int) omp_get_max_threads()+1) && (((uintptr_t)(const void*)(x.data())%64)==0) && (((uintptr_t)(const void*)(y.data())%64)==0) ) { + //Note BMK: this case is typically not called in practice even for OpenMP, since + //it requires row_block_offsets to have been computed in the graph. spmv_raw_openmp_no_transpose(alpha,A,x,beta,y); return; } #endif - int team_size = -1; - int vector_length = -1; - int64_t rows_per_thread = -1; - - // Note on 03/24/20, lbv: We can use the controls - // here to allow the user to pass in some tunning - // parameters. 
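The launch-parameter hunk above caps the vector length at the hardware width, 32 lanes per CUDA warp and 64 per HIP wavefront, and keeps doubling only while each lane would still see about six entries of the row. The same heuristic in isolation, as an assumed standalone form rather than the actual spmv_launch_parameters signature:

// Pick a power-of-two vector length for rows of ~nnz_per_row entries.
int pick_vector_length(long long nnz_per_row, int hardware_width) {
  int vector_length = 1;
  while (vector_length < hardware_width && vector_length * 6 < nnz_per_row)
    vector_length *= 2;
  return vector_length;
}
// e.g. pick_vector_length(40, 32) == 8: eight lanes share the row, about
// five entries each; with hardware_width 64 the answer is unchanged,
// since the 6x threshold binds before the cap does.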
- if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} - if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} - if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule bool use_static_schedule = false; // Forces the use of a static schedule if(controls.isParameter("schedule")) { @@ -304,26 +335,45 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, use_static_schedule = true; } } - - int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); - int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; - - SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); - - if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); - else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); - } else { - Kokkos::TeamPolicy > policy(1,1); - if(team_size<0) - policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + if(use_teams) { + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + + // Note on 03/24/20, lbv: We can use the controls + // here to allow the user to pass in some tuning + // parameters. + if(controls.isParameter("team size")) {team_size = std::stoi(controls.getParameter("team size"));} + if(controls.isParameter("vector length")) {vector_length = std::stoi(controls.getParameter("vector length"));} + if(controls.isParameter("rows per thread")) {rows_per_thread = std::stoll(controls.getParameter("rows per thread"));} + + int64_t rows_per_team = spmv_launch_parameters(A.numRows(),A.nnz(),rows_per_thread,team_size,vector_length); + int64_t worksets = (y.extent(0)+rows_per_team-1)/rows_per_team; + + SPMV_Functor func (alpha,A,x,beta,y,rows_per_team); + + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } else { + Kokkos::TeamPolicy > policy(1,1); + if(team_size<0) + policy = Kokkos::TeamPolicy >(worksets,Kokkos::AUTO,vector_length); + else + policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); + Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + } + } + else { + SPMV_Functor func (alpha,A,x,beta,y,1); + if(((A.nnz()>10000000) || use_dynamic_schedule) && !use_static_schedule) + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); else - policy = Kokkos::TeamPolicy >(worksets,team_size,vector_length); - Kokkos::parallel_for("KokkosSparse::spmv",policy,func); + Kokkos::parallel_for("KokkosSparse::spmv",Kokkos::RangePolicy>(0, A.numRows()),func); } } @@ -339,7 +389,8 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, typename YVector::const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename
AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -351,15 +402,23 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } - typedef typename AMatrix::size_type size_type; - // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<32) ) vector_length*=2; + int max_vector_length = 1; +#ifdef KOKKOS_ENABLE_CUDA + if(std::is_same::value) + max_vector_length = 32; +#endif +#ifdef KOKKOS_ENABLE_HIP + if(std::is_same::value) + max_vector_length = 64; +#endif + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; typedef SPMV_Transpose_Functor OpType; @@ -367,9 +426,9 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, y); - const int rows_per_thread = RowsPerThread (NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -626,6 +685,65 @@ struct SPMV_MV_LayoutLeft_Functor { } } + template + KOKKOS_INLINE_FUNCTION void + strip_mine (const ordinal_type& iRow, const ordinal_type& kk) const + { + y_value_type sum[UNROLL]; + +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + sum[k] = Kokkos::Details::ArithTraits::zero (); + } + + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. + + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? 
+ Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + if(doalpha == 1) + sum[k] += val * m_x(ind, kk + k); + else if(doalpha == -1) + sum[k] -= val * m_x(ind, kk + k); + else + sum[k] += alpha * val * m_x(ind, kk + k); + } + } + + if(doalpha == -1) + + if (dobeta == 0) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = sum[k]; + } else if (dobeta == 1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = m_y(iRow, kk + k) + sum[k]; + } else if (dobeta == -1) { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k]; + } else { + for(ordinal_type k = 0; k < UNROLL; k++) + m_y(iRow, kk + k) = beta * m_y(iRow, kk + k) + sum[k]; + } + } + KOKKOS_INLINE_FUNCTION void strip_mine_1 (const team_member& dev, const ordinal_type& iRow) const { @@ -666,6 +784,141 @@ struct SPMV_MV_LayoutLeft_Functor { }); } + KOKKOS_INLINE_FUNCTION void + strip_mine_1 (const ordinal_type& iRow) const + { + const auto row = m_A.rowConst (iRow); + + // The correct type of iEntry is ordinal_type, the type of the + // number of columns in the (local) matrix. This is because we + // assume either that rows have no duplicate entries, or that rows + // never have enough duplicate entries to overflow ordinal_type. + + y_value_type sum = y_value_type(); + for(ordinal_type iEntry = 0; iEntry < row.length; iEntry++) + { + const A_value_type val = conjugate ? + Kokkos::Details::ArithTraits::conj (row.value(iEntry)) : + row.value(iEntry); + sum += val * m_x(row.colidx(iEntry),0); + } + if (doalpha == -1) { + sum = -sum; + } else if (doalpha != 1) { + sum *= alpha; + } + + if (dobeta == 0) { + m_y(iRow, 0) = sum ; + } else if (dobeta == 1) { + m_y(iRow, 0) += sum ; + } else if (dobeta == -1) { + m_y(iRow, 0) = -m_y(iRow, 0) + sum; + } else { + m_y(iRow, 0) = beta * m_y(iRow, 0) + sum; + } + } + + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type& iRow) const + { + // mfh 20 Mar 2015, 07 Jun 2016: This is ordinal_type because it + // needs to have the same type as n. 
+ ordinal_type kk = 0; + +#ifdef KOKKOS_FAST_COMPILE + for (; kk + 4 <= n; kk += 4) { + strip_mine<4>(iRow, kk); + } + for( ; kk < n; ++kk) { + strip_mine<1>(iRow, kk); + } +#else +# if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + if ((n > 8) && (n % 8 == 1)) { + strip_mine<9>(iRow, kk); + kk += 9; + } + for(; kk + 8 <= n; kk += 8) + strip_mine<8>(iRow, kk); + if(kk < n) { + switch(n - kk) { +# else // NOT a GPU + if ((n > 16) && (n % 16 == 1)) { + strip_mine<17>(iRow, kk); + kk += 17; + } + + for (; kk + 16 <= n; kk += 16) { + strip_mine<16>(iRow, kk); + } + + if(kk < n) { + switch(n - kk) { + case 15: + strip_mine<15>(iRow, kk); + break; + + case 14: + strip_mine<14>(iRow, kk); + break; + + case 13: + strip_mine<13>(iRow, kk); + break; + + case 12: + strip_mine<12>(iRow, kk); + break; + + case 11: + strip_mine<11>(iRow, kk); + break; + + case 10: + strip_mine<10>(iRow, kk); + break; + + case 9: + strip_mine<9>(iRow, kk); + break; + + case 8: + strip_mine<8>(iRow, kk); + break; +# endif // if/else: __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__ + case 7: + strip_mine<7>(iRow, kk); + break; + + case 6: + strip_mine<6>(iRow, kk); + break; + + case 5: + strip_mine<5>(iRow, kk); + break; + + case 4: + strip_mine<4>(iRow, kk); + break; + + case 3: + strip_mine<3>(iRow, kk); + break; + + case 2: + strip_mine<2>(iRow, kk); + break; + + case 1: + strip_mine_1(iRow); + break; + } + } +#endif // KOKKOS_FAST_COMPILE + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const @@ -794,7 +1047,8 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a const typename YVector::non_const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; } @@ -806,16 +1060,18 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a return; } else { - typedef typename AMatrix::size_type size_type; // Assuming that no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); + const ordinal_type NNZPerRow = A.nnz () / A.numRows (); - int vector_length = 1; - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + ordinal_type vector_length = 1; + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length *= 2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels @@ -825,17 +1081,17 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a typename AMatrix::const_ordinal_type nrow = A.numRows(); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int.
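The flat operator() above maps the column count n onto compile-time unroll factors: peel one odd-sized chunk when the remainder would otherwise be a lone column, run the bulk in chunks of 8 (16 on CPU builds), and dispatch the tail through the switch. A reduced sketch of that dispatch, with a hypothetical do_chunk standing in for strip_mine:

template <int UNROLL>
void do_chunk(int kk) { /* process columns [kk, kk + UNROLL) */ }

void dispatch_columns(int n) {
  int kk = 0;
  // Peel 9 when n % 8 == 1, so the tail never degenerates to one column.
  if (n > 8 && n % 8 == 1) { do_chunk<9>(kk); kk += 9; }
  for (; kk + 8 <= n; kk += 8) do_chunk<8>(kk);
  switch (n - kk) {  // remainder is 0 or 2..7, or 1 only when n == 1
    case 7: do_chunk<7>(kk); break;
    case 6: do_chunk<6>(kk); break;
    case 5: do_chunk<5>(kk); break;
    case 4: do_chunk<4>(kk); break;
    case 3: do_chunk<3>(kk); break;
    case 2: do_chunk<2>(kk); break;
    case 1: do_chunk<1>(kk); break;  // the patch calls strip_mine_1 here
    default: break;
  }
}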
- const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space >( 0, nrow ), op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -846,18 +1102,18 @@ spmv_alpha_beta_mv_no_transpose (const typename YVector::non_const_value_type& a OpType op (alpha, A, x, beta, y, RowsPerThread (NNZPerRow), vector_length); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); - + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } } @@ -875,7 +1131,8 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const typename YVector::non_const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; if (A.numRows () <= static_cast (0)) { return; @@ -895,10 +1152,12 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph // the appropriate type is ordinal_type. 
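Whichever policy runs, the launch must cover every row; the team path gets that from a ceiling division, so the last team picks up whatever is left after the full teams. A tiny illustration with a hypothetical helper:

// Number of teams needed for nrow rows at rows_per_team rows apiece.
long long compute_worksets(long long nrow, long long rows_per_team) {
  return (nrow + rows_per_team - 1) / rows_per_team;
}
// e.g. compute_worksets(1000, 64) == 16: fifteen full teams cover 960
// rows and the sixteenth handles the remaining 40.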
const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); - int vector_length = 1; + ordinal_type vector_length = 1; //Transpose functor uses atomics which can't be vectorized on CPU - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) - while( (static_cast (vector_length*2*3) <= NNZPerRow) && (vector_length<8) ) vector_length*=2; + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) + vector_length*=2; + } #ifndef KOKKOS_FAST_COMPILE // This uses templated functions on doalpha and dobeta and will produce 16 kernels @@ -906,16 +1165,11 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph doalpha, dobeta, conjugate> OpType; OpType op (alpha, A, x, beta, y); - typename AMatrix::const_ordinal_type nrow = A.numRows(); + const ordinal_type nrow = A.numRows(); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. - const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > @@ -930,14 +1184,9 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph OpType op (alpha, A, x, beta, y); - // FIXME (mfh 07 Jun 2016) Shouldn't we use ordinal_type here - // instead of int? For example, if the number of threads is 1, - // then this is just the number of rows. Ditto for rows_per_team. - // team_size is a hardware resource thing so it might legitimately - // be int. 
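The transpose kernels above scatter rather than gather: each entry of row i adds alpha * A(i,j) * x(i) into y(j), so threads on different rows can collide on one y entry and must update it atomically, which is also why the comment notes the functor cannot vectorize on CPU. A sketch of one row's scatter over illustrative raw CSR arrays (the functors do the equivalent through rowConst):

#include <Kokkos_Core.hpp>

// Scatter one row's contribution to y = alpha * A^T * x.
KOKKOS_INLINE_FUNCTION
void scatter_row(const int* colidx, const double* values,
                 int row_begin, int row_end,
                 double alpha, double x_row, double* y) {
  for (int j = row_begin; j < row_end; ++j) {
    // Distinct rows may target the same y entry, hence the atomic.
    Kokkos::atomic_add(&y[colidx[j]], alpha * values[j] * x_row);
  }
}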
- const int rows_per_thread = RowsPerThread(NNZPerRow); - const int team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const int rows_per_team = rows_per_thread * team_size; + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > diff --git a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp index a4f1c07258..72c8a969fe 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp @@ -47,7 +47,6 @@ namespace Impl { #ifdef KOKKOS_ENABLE_OPENMP template void spmv_raw_openmp_no_transpose(typename YVector::const_value_type& s_a, AMatrix A, XVector x, typename YVector::const_value_type& s_b, YVector y) { - typedef typename YVector::non_const_value_type value_type; typedef typename AMatrix::ordinal_type ordinal_type; typedef typename AMatrix::non_const_size_type size_type; diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index e27012991a..598f906f8d 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -450,6 +450,36 @@ void test_spmv_mv(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_v } } +template +void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV){ + + typedef typename KokkosSparse::CrsMatrix crsMat_t; + + typedef Kokkos::View ViewTypeX; + typedef Kokkos::View ViewTypeY; + + crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); + Kokkos::Random_XorShift64_Pool rand_pool(13718); + + for(int nv = 1; nv <= numMV; nv++) { + ViewTypeX b_x("A",numRows,nv); + ViewTypeY b_y("B",numRows,nv); + ViewTypeY b_y_copy("B",numRows,nv); + + Kokkos::fill_random(b_x,rand_pool,scalar_t(10)); + Kokkos::fill_random(b_y,rand_pool,scalar_t(10)); + + Kokkos::deep_copy(b_y_copy, b_y); + + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'T'); + } +} + template void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { @@ -816,6 +846,7 @@ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ + test_spmv_mv_heavy (200, 200 * 10, 60, 4, 50); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 792e48f6a6763b889d7c11d1a598218bdaf0ca4b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 21:46:41 -0600 Subject: [PATCH 14/18] Fixed typo in spmv --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 2 -- unit_test/sparse/Test_Sparse_spmv.hpp | 11 
++++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 558acc363a..55da2dea60 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -727,8 +727,6 @@ struct SPMV_MV_LayoutLeft_Functor { } } - if(doalpha == -1) - if (dobeta == 0) { for(ordinal_type k = 0; k < UNROLL; k++) m_y(iRow, kk + k) = sum[k]; diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 598f906f8d..5a033fdf34 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -42,7 +42,7 @@ struct fSPMV { if(error > eps) { err++; - printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); + //printf("expected_y(%d)=%f, y(%d)=%f\n", i, AT::abs(expected_y(i)), i, AT::abs(y(i))); } } }; @@ -203,8 +203,9 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vecto my_exec_space(0,y_i.extent(0)), fSPMV(y_i, y_spmv, eps), num_errors); - if(num_errors>0) printf("KokkosSparse::Test::spmv_mv: %i errors of %i for mv %i\n", - num_errors, y_i.extent_int(0), i); + if(num_errors>0) + std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors << " errors of " << y_i.extent_int(0) + << " for mv " << i << " (alpha=" << alpha << ", beta=" << beta << ", mode = " << mode << ")\n"; EXPECT_TRUE(num_errors==0); } } @@ -458,7 +459,7 @@ void test_spmv_mv_heavy(lno_t numRows,size_type nnz, lno_t bandwidth, lno_t row_ typedef Kokkos::View ViewTypeX; typedef Kokkos::View ViewTypeY; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numCols,nnz,row_size_variance, bandwidth); + crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix(numRows,numRows,nnz,row_size_variance, bandwidth); Kokkos::Random_XorShift64_Pool rand_pool(13718); for(int nv = 1; nv <= numMV; nv++) { @@ -846,7 +847,7 @@ TEST_F( TestCategory,sparse ## _ ## spmv_mv ## _ ## SCALAR ## _ ## ORDINAL ## _ test_spmv_mv (50000, 50000 * 30, 100, 10, 5); \ test_spmv_mv (50000, 50000 * 30, 200, 10, 1); \ test_spmv_mv (10000, 10000 * 20, 100, 5, 10); \ - test_spmv_mv_heavy (200, 200 * 10, 60, 4, 50); \ + test_spmv_mv_heavy (200, 200 * 10, 60, 4, 30); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 55933aa0790b248c6171c080e238c87fb8108787 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 26 Oct 2020 22:50:44 -0600 Subject: [PATCH 15/18] Use range policy for omp mode T spmv/spmv_mv --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 123 ++++++++++++++++----- 1 file changed, 95 insertions(+), 28 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 55da2dea60..e3934e9d5d 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -105,6 +105,21 @@ struct SPMV_Transpose_Functor { alpha (alpha_), m_A (m_A_), m_x (m_x_), m_y (m_y_) {} + KOKKOS_INLINE_FUNCTION void + operator() (const ordinal_type iRow) const + { + const auto row = m_A.rowConst (iRow); + const ordinal_type row_length = row.length; + for(ordinal_type iEntry = 0; iEntry < row_length; iEntry++) + { + const value_type val = conjugate ? 
+ ATV::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + Kokkos::atomic_add (&m_y(ind), static_cast (alpha * val * m_x(iRow))); + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { @@ -408,6 +423,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA if(std::is_same::value) @@ -417,8 +433,10 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, if(std::is_same::value) max_vector_length = 64; #endif - while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) - vector_length*=2; + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; + } typedef SPMV_Transpose_Functor OpType; @@ -426,14 +444,19 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, y); - const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); - + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0 , nrow ) , op ); + } } template::conj (row.value(iEntry)) : + row.value(iEntry); + const ordinal_type ind = row.colidx(iEntry); + + if (doalpha != 1) { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (alpha * val * m_x(iRow, k))); + } + } else { + #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL + #pragma unroll + #endif + for (ordinal_type k = 0; k < n; ++k) { + Kokkos::atomic_add (&m_y(ind,k), + static_cast (val * m_x(iRow, k))); + } + } + } + } + KOKKOS_INLINE_FUNCTION void operator() (const team_member& dev) const { @@ -1151,8 +1207,9 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph const ordinal_type NNZPerRow = static_cast (A.nnz () / A.numRows ()); ordinal_type vector_length = 1; + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); //Transpose functor uses atomics which can't be vectorized on CPU - if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if(use_teams) { while( (vector_length*2*3 <= NNZPerRow) && (vector_length<8) ) vector_length*=2; } @@ -1164,14 +1221,19 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph OpType op (alpha, A, x, beta, y); const ordinal_type 
nrow = A.numRows(); - - const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for ("KokkosSparse::spmv", Kokkos::RangePolicy < typename AMatrix::execution_space > + ( 0 , nrow ) , op ); + } #else // KOKKOS_FAST_COMPILE this will only instantiate one Kernel for alpha/beta @@ -1179,16 +1241,21 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph 2, 2, conjugate, SizeType> OpType; typename AMatrix::const_ordinal_type nrow = A.numRows(); + if(use_teams) { + OpType op (alpha, A, x, beta, y); - OpType op (alpha, A, x, beta, y); - - const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > - ( nteams , team_size , vector_length ) , op ); + const ordinal_type rows_per_thread = RowsPerThread(NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + ( 0, nrow ) , op ); + } #endif // KOKKOS_FAST_COMPILE } From 80fc49c1e16236f6b33115d972bbe584ac602c85 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 08:58:56 -0600 Subject: [PATCH 16/18] Remove duplicate local typedef --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index e3934e9d5d..d42a0c81b2 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -1199,7 +1199,6 @@ spmv_alpha_beta_mv_transpose (const typename YVector::non_const_value_type& alph } if (doalpha != 0) { - typedef typename AMatrix::size_type size_type; // Assuming that 
no row contains duplicate entries, NNZPerRow // cannot be more than the number of columns of the matrix. Thus, From 1a35a8d33a6acceacae542700c1b1ad375bb0aa5 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 09:05:09 -0600 Subject: [PATCH 17/18] Remove unused var --- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 07c29e3735..aa8f2ddfa3 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -80,7 +80,6 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, } numRows = A.numRows(); numCols = A.numCols(); - Offset nnz = A.nnz(); mv_type x("X", numCols, num_vecs); mv_type y("Y", numRows, num_vecs); h_mv_type h_x = Kokkos::create_mirror_view(x); From d3909de4a28049a507e8983cbac6febecb188010 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 27 Oct 2020 10:03:34 -0600 Subject: [PATCH 18/18] Fix execution_space typedef --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index d42a0c81b2..7b91f95e09 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -406,6 +406,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, { using ordinal_type = typename AMatrix::non_const_ordinal_type; using size_type = typename AMatrix::non_const_size_type; + using execution_space = typename AMatrix::execution_space; if (A.numRows () <= static_cast (0)) { return; @@ -423,7 +424,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, const ordinal_type NNZPerRow = A.nnz () / A.numRows (); int vector_length = 1; - bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); + bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA if(std::is_same::value) @@ -445,16 +446,16 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, OpType op (alpha, A, x, y); if(use_teams) { - const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); const ordinal_type rows_per_team = rows_per_thread * team_size; op.rows_per_team = rows_per_team; const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< typename AMatrix::execution_space > + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > ( nteams , team_size , vector_length ) , op ); } else { - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< typename AMatrix::execution_space > + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > ( 0 , nrow ) , op ); } }
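Taken together, the series leaves each spmv entry point with one functor and two launch shapes chosen by the execution space: hierarchical TeamPolicy parallelism where vector lanes pay off, and a flat RangePolicy over rows where they do not. A condensed sketch of that final pattern, with an illustrative functor rather than the actual KokkosSparse internals:

#include <Kokkos_Core.hpp>

// One functor, two entry points, echoing SPMV_Functor after this series:
// the team operator covers a block of rows cooperatively, the flat
// operator covers exactly one row.
struct SpmvSketch {
  KOKKOS_INLINE_FUNCTION
  void operator()(const int iRow) const { /* process row iRow */ }
  KOKKOS_INLINE_FUNCTION
  void operator()(const Kokkos::TeamPolicy<>::member_type& dev) const {
    /* process this team's block of rows */
  }
};

void launch_spmv(bool use_teams, int nrow, int worksets,
                 int team_size, int vector_length) {
  SpmvSketch op;
  if (use_teams)
    Kokkos::parallel_for("spmv_sketch",
        Kokkos::TeamPolicy<>(worksets, team_size, vector_length), op);
  else
    Kokkos::parallel_for("spmv_sketch",
        Kokkos::RangePolicy<>(0, nrow), op);
}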