From 92267d98a8c5f697473079b8af2061bd81befc1f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 1 Aug 2024 15:09:15 -0600 Subject: [PATCH] Final cleanup, formatting --- perf_test/sparse/CMakeLists.txt | 10 +- sparse/impl/KokkosSparse_sort_crs_impl.hpp | 9 ++ sparse/src/KokkosSparse_SortCrs.hpp | 109 ++++++++++++--------- sparse/src/KokkosSparse_Utils.hpp | 23 ++++- sparse/unit_test/Test_Sparse.hpp | 52 +++++----- 5 files changed, 123 insertions(+), 80 deletions(-) diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 10c23d678e..231001241f 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -116,10 +116,12 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_mdf.cpp ) -KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_sort_crs - SOURCES KokkosSparse_sort_crs.cpp -) +# Do not build this CRS sorting perf test by default. +# It can be enabled if needed by uncommenting these lines. +#KOKKOSKERNELS_ADD_EXECUTABLE( +# sparse_sort_crs +# SOURCES KokkosSparse_sort_crs.cpp +#) if (KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( diff --git a/sparse/impl/KokkosSparse_sort_crs_impl.hpp b/sparse/impl/KokkosSparse_sort_crs_impl.hpp index fdb1f4f50c..a0dded0628 100644 --- a/sparse/impl/KokkosSparse_sort_crs_impl.hpp +++ b/sparse/impl/KokkosSparse_sort_crs_impl.hpp @@ -316,6 +316,15 @@ void applyPermutationBlockValues(const ExecSpace& exec, out(i) = in(permutation(blockIndex) * scalarsPerBlock + offsetInBlock); }); } + +// Heuristic for choosing bulk sorting algorithm +template +bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) { + // Use bulk sort if matrix is highly imbalanced, + // OR the longest rows have many entries. + return (maxDeg / 10 > avgDeg) || (maxDeg > 1024); +} + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index fd4e6a7147..a85a260ce0 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -17,6 +17,7 @@ #define _KOKKOSSPARSE_SORTCRS_HPP #include "KokkosSparse_sort_crs_impl.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Sorting.hpp" namespace KokkosSparse { @@ -64,15 +65,17 @@ void sort_crs_matrix( "sort_crs_matrix: entries_t must not be const-valued"); static_assert(!std::is_const_v, "sort_crs_matrix: value_t must not be const-valued"); - using Ordinal = typename entries_t::non_const_value_type; - using Scalar = typename values_t::non_const_value_type; + using Offset = typename rowmap_t::non_const_value_type; + using Ordinal = typename entries_t::non_const_value_type; + using Scalar = typename values_t::non_const_value_type; + // This early return condition covers having 0 or 1 entries, + // which is also implied by having 0 rows or 0 columns. + // If only 1 entry, the matrix is already sorted. if (entries.extent(0) <= size_t(1)) { return; } Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { - using Offset = typename rowmap_t::non_const_value_type; using UnsignedOrdinal = typename std::make_unsigned::type; using entries_managed_t = Kokkos::View; @@ -100,21 +103,46 @@ void sort_crs_matrix( values.data() + rowStart, valuesAux.data() + rowStart, rowNum); }); } else { - // On GPUs, prefer to do a single bulk sort. - if (numCols == Kokkos::ArithTraits::max()) { - KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, - numCols); - numCols++; + // On GPUs: + // If the matrix is highly imbalanced, or has long rows AND the dimensions + // are not too large to do one large bulk sort, do that. Otherwise, sort + // using one Kokkos thread per row. + Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; + Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap); + bool useBulkSort = false; + if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { + // Calculate the true number of columns if user didn't pass it in + if (numCols == Kokkos::ArithTraits::max()) { + KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), + entries, numCols); + numCols++; + } + uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols; + useBulkSort = maxBulkKey / numRows == (uint64_t)numCols; } - uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols; - if (maxBulkKey / numRows != (uint64_t)numCols) { - using Offset = typename rowmap_t::non_const_value_type; + if (useBulkSort) { + auto permutation = KokkosSparse::Impl::computeEntryPermutation( + exec, rowmap, entries, numCols); + // Permutations cannot be done in-place + Kokkos::View origValues( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"), + values.extent(0)); + Kokkos::View + origEntries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "origEntries"), + entries.extent(0)); + Kokkos::deep_copy(exec, origValues, values); + Kokkos::deep_copy(exec, origEntries, entries); + KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries, + entries); + KokkosSparse::Impl::applyPermutation(exec, permutation, origValues, + values); + } else { using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; // Can't use bulk sort approach as matrix dimensions are too large. // Fall back to parallel thread-level sort within each row. Ordinal vectorLength = 1; - Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; while (vectorLength < avgDeg / 2) { vectorLength *= 2; } @@ -130,23 +158,6 @@ void sort_crs_matrix( TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct); - } else { - auto permutation = KokkosSparse::Impl::computeEntryPermutation( - exec, rowmap, entries, numCols); - // Permutations cannot be done in-place - Kokkos::View origValues( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"), - values.extent(0)); - Kokkos::View - origEntries( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "origEntries"), - entries.extent(0)); - Kokkos::deep_copy(exec, origValues, values); - Kokkos::deep_copy(exec, origEntries, entries); - KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries, - entries); - KokkosSparse::Impl::applyPermutation(exec, permutation, origValues, - values); } } } @@ -263,6 +274,7 @@ void sort_crs_graph( typename entries_t::non_const_value_type numCols = Kokkos::ArithTraits::max()) { using Ordinal = typename entries_t::non_const_value_type; + using Offset = typename rowmap_t::non_const_value_type; static_assert( Kokkos::SpaceAccessibility::accessible, @@ -286,7 +298,6 @@ void sort_crs_graph( using entries_managed_t = Kokkos::View; using UnsignedOrdinal = typename std::make_unsigned::type; - using Offset = typename rowmap_t::non_const_value_type; entries_managed_t entriesAux( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), entries.extent(0)); @@ -304,22 +315,32 @@ void sort_crs_graph( (UnsignedOrdinal*)entriesAux.data() + rowStart, rowNum); }); } else { - // On GPU, prefer to sort all entries in one bulk sort. - // This is only possible if (row, col) pairs can be numbered using uint64_t - if (numCols == Kokkos::ArithTraits::max()) { - KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, - numCols); - numCols++; + // On GPUs: + // If the graph is highly imbalanced AND the dimensions are not too large + // to do one large bulk sort, do that. Otherwise, sort using one Kokkos + // thread per row. + Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; + Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap); + bool useBulkSort = false; + if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { + // Calculate the true number of columns if user didn't pass it in + if (numCols == Kokkos::ArithTraits::max()) { + KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), + entries, numCols); + numCols++; + } + uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols; + useBulkSort = maxBulkKey / numRows == (uint64_t)numCols; } - uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols; - // Check if the multiplication above overflowed - if (maxBulkKey / numRows != (uint64_t)numCols) { - using Offset = typename rowmap_t::non_const_value_type; + if (useBulkSort) { + auto keys = KokkosSparse::Impl::generateBulkCrsKeys(exec, rowmap, entries, + numCols); + Kokkos::Experimental::sort_by_key(exec, keys, entries); + } else { using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; // Fall back to thread-level sort within each row Ordinal vectorLength = 1; - Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows; while (vectorLength < avgDeg / 2) { vectorLength *= 2; } @@ -335,10 +356,6 @@ void sort_crs_graph( TeamPol(exec, (numRows + teamSize - 1) / teamSize, teamSize, vectorLength), funct); - } else { - auto keys = KokkosSparse::Impl::generateBulkCrsKeys(exec, rowmap, entries, - numCols); - Kokkos::Experimental::sort_by_key(exec, keys, entries); } } } diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 876ba0fc58..234a9b45f8 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -981,6 +981,21 @@ ordinal_t graph_max_degree(const rowmap_t &rowmap) { return val; } +template +typename rowmap_t::non_const_value_type graph_max_degree( + const execution_space &exec, const rowmap_t &rowmap) { + using Offset = typename rowmap_t::non_const_value_type; + using Reducer = Kokkos::Max; + Offset nrows = rowmap.extent(0); + if (nrows) nrows--; + if (nrows == 0) return 0; + Offset val; + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, nrows), + MaxDegreeFunctor(rowmap), + Reducer(val)); + return val; +} + template void graph_min_max_degree(const rowmap_t &rowmap, ordinal_t &min_degree, ordinal_t &max_degree) { @@ -1625,8 +1640,8 @@ void kk_create_incidence_matrix_from_original_matrix( lno_t col_perm = col; if (perm) col_perm = perm[col]; if (row_perm > col_perm) { - typedef typename std::remove_reference::type atomic_incr_type; + typedef typename std::remove_reference< + decltype(out_rowmap_copy[0])>::type atomic_incr_type; size_type row_write_index = Kokkos::atomic_fetch_add( &(out_rowmap_copy[row]), atomic_incr_type(1)); size_type col_write_index = Kokkos::atomic_fetch_add( @@ -1658,8 +1673,8 @@ void kk_create_incidence_matrix_from_original_matrix( lno_t col_perm = col; if (perm) col_perm = perm[col]; if (row_perm < col_perm) { - typedef typename std::remove_reference::type atomic_incr_type; + typedef typename std::remove_reference< + decltype(out_rowmap_copy[0])>::type atomic_incr_type; size_type row_write_index = Kokkos::atomic_fetch_add( &(out_rowmap_copy[row]), atomic_incr_type(1)); size_type col_write_index = Kokkos::atomic_fetch_add( diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index f27b02ec99..3663122e92 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -16,33 +16,33 @@ #ifndef TEST_SPARSE_HPP #define TEST_SPARSE_HPP -//#include "Test_Sparse_coo2crs.hpp" -//#include "Test_Sparse_crs2coo.hpp" -//#include "Test_Sparse_Controls.hpp" -//#include "Test_Sparse_CrsMatrix.hpp" -//#include "Test_Sparse_mdf.hpp" -//#include "Test_Sparse_findRelOffset.hpp" -//#include "Test_Sparse_gauss_seidel.hpp" -//#include "Test_Sparse_MergeMatrix.hpp" -//#include "Test_Sparse_replaceSumInto.hpp" -//#include "Test_Sparse_replaceSumIntoLonger.hpp" -//#include "Test_Sparse_spadd.hpp" -//#include "Test_Sparse_spgemm_jacobi.hpp" -//#include "Test_Sparse_spgemm.hpp" +#include "Test_Sparse_coo2crs.hpp" +#include "Test_Sparse_crs2coo.hpp" +#include "Test_Sparse_Controls.hpp" +#include "Test_Sparse_CrsMatrix.hpp" +#include "Test_Sparse_mdf.hpp" +#include "Test_Sparse_findRelOffset.hpp" +#include "Test_Sparse_gauss_seidel.hpp" +#include "Test_Sparse_MergeMatrix.hpp" +#include "Test_Sparse_replaceSumInto.hpp" +#include "Test_Sparse_replaceSumIntoLonger.hpp" +#include "Test_Sparse_spadd.hpp" +#include "Test_Sparse_spgemm_jacobi.hpp" +#include "Test_Sparse_spgemm.hpp" #include "Test_Sparse_SortCrs.hpp" -//#include "Test_Sparse_spiluk.hpp" -//#include "Test_Sparse_spmv.hpp" -//#include "Test_Sparse_sptrsv.hpp" -//#include "Test_Sparse_trsv.hpp" -//#include "Test_Sparse_par_ilut.hpp" -//#include "Test_Sparse_gmres.hpp" -//#include "Test_Sparse_Transpose.hpp" -//#include "Test_Sparse_TestUtils_RandCsMat.hpp" -//#include "Test_Sparse_IOUtils.hpp" -//#include "Test_Sparse_ccs2crs.hpp" -//#include "Test_Sparse_crs2ccs.hpp" -//#include "Test_Sparse_removeCrsMatrixZeros.hpp" -//#include "Test_Sparse_extractCrsDiagonalBlocks.hpp" +#include "Test_Sparse_spiluk.hpp" +#include "Test_Sparse_spmv.hpp" +#include "Test_Sparse_sptrsv.hpp" +#include "Test_Sparse_trsv.hpp" +#include "Test_Sparse_par_ilut.hpp" +#include "Test_Sparse_gmres.hpp" +#include "Test_Sparse_Transpose.hpp" +#include "Test_Sparse_TestUtils_RandCsMat.hpp" +#include "Test_Sparse_IOUtils.hpp" +#include "Test_Sparse_ccs2crs.hpp" +#include "Test_Sparse_crs2ccs.hpp" +#include "Test_Sparse_removeCrsMatrixZeros.hpp" +#include "Test_Sparse_extractCrsDiagonalBlocks.hpp" // TPL specific tests, these require // particular pairs of backend and TPL