From d362576a1677b71390ee2d87f4432a3cbae49fe1 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 8 Apr 2021 17:16:46 -0600 Subject: [PATCH 1/3] Move sorting functionality out of Impl:: Make sorting (CRS, device/team bitonic, serial radix) a public feature for users. Moved all of the CRS sorting stuff out of SparseUtils and into KokkosKernels_Sorting.hpp, so that including KokkosKernels_Utils.hpp doesn't require compiling all the sorting stuff. --- perf_test/graph/KokkosGraph_run_triangle.hpp | 37 +- .../sparse/KokkosSparse_multimem_spgemm.hpp | 29 +- perf_test/sparse/KokkosSparse_run_spgemm.hpp | 43 +- .../sparse/KokkosSparse_run_spgemm_jacobi.hpp | 47 +- src/common/KokkosKernels_Sorting.hpp | 866 ++++++++++++++---- src/common/KokkosKernels_SparseUtils.hpp | 553 +---------- src/graph/KokkosGraph_ExplicitCoarsening.hpp | 6 +- src/sparse/KokkosSparse_spadd.hpp | 2 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 4 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 4 +- test_common/KokkosKernels_MatrixConverter.cpp | 24 +- unit_test/common/Test_Common_Sorting.hpp | 6 +- unit_test/common/Test_Common_Transpose.hpp | 6 +- unit_test/sparse/Test_Sparse_spgemm.hpp | 45 +- .../sparse/Test_Sparse_spgemm_jacobi.hpp | 44 +- 15 files changed, 742 insertions(+), 974 deletions(-) diff --git a/perf_test/graph/KokkosGraph_run_triangle.hpp b/perf_test/graph/KokkosGraph_run_triangle.hpp index 09a651f04d..ff5170e938 100644 --- a/perf_test/graph/KokkosGraph_run_triangle.hpp +++ b/perf_test/graph/KokkosGraph_run_triangle.hpp @@ -67,39 +67,16 @@ bool is_same_graph(crsGraph_t output_mat1, crsGraph_t output_mat2){ size_t nentries2 = output_mat2.entries.extent(0) ; //size_t nvals2 = output_mat2.values.extent(0); - - lno_nnz_view_t h_ent1 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries1); - lno_nnz_view_t h_vals1 (Kokkos::ViewAllocateWithoutInitializing("v1"), nentries1); - - - KokkosKernels::Impl::kk_sort_graph( - output_mat1.row_map, output_mat1.entries,h_vals1, - h_ent1, 
h_vals1 - ); - - lno_nnz_view_t h_ent2 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries2); - lno_nnz_view_t h_vals2 (Kokkos::ViewAllocateWithoutInitializing("v1"), nentries2); + KokkosKernels::sort_crs_graph + (output_mat1.graph.row_map, output_mat1.entries); if (nrows1 != nrows2) return false; if (nentries1 != nentries2) return false; - KokkosKernels::Impl::kk_sort_graph - ( - output_mat2.row_map, output_mat2.entries, h_vals2, - h_ent2, h_vals2 - ); + KokkosKernels::sort_crs_graph + (output_mat2.graph.row_map, output_mat2.entries); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view @@ -109,7 +86,7 @@ bool is_same_graph(crsGraph_t output_mat1, crsGraph_t output_mat2){ is_identical = KokkosKernels::Impl::kk_is_identical_view (h_ent1, h_ent2, 0 ); + typename device::execution_space>(output_mat1.entries, output_mat2.entries, 0 ); if (!is_identical) return false; if (!is_identical) { diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp index 0d285cbd76..f31b611795 100644 --- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp +++ b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp @@ -248,41 +248,26 @@ namespace Experiment{ if (c_mat_file != NULL){ if (params.c_mem_space == 1){ - - fast_cols_view_t sorted_adj("sorted adj", c_fast_crsmat.graph.entries.extent(0)); - fast_values_view_t sorted_vals("sorted vals", c_fast_crsmat.graph.entries.extent(0)); - - KokkosKernels::Impl::kk_sort_graph - ( - c_fast_crsmat.graph.row_map, - c_fast_crsmat.graph.entries, - c_fast_crsmat.values, sorted_adj, sorted_vals); + KokkosKernels::sort_crs_matrix(c_fast_crsmat); KokkosKernels::Impl::write_graph_bin( (lno_t) (c_fast_crsmat.numRows()), (size_type) (c_fast_crsmat.graph.entries.extent(0)), c_fast_crsmat.graph.row_map.data(), - sorted_adj.data(), - sorted_vals.data(), + c_fast_crsmat.graph.entries.data(), + c_fast_crsmat.values.data(), c_mat_file); } else { - slow_cols_view_t 
sorted_adj("sorted adj", c_fast_crsmat.graph.entries.extent(0)); - slow_values_view_t sorted_vals("sorted vals", c_fast_crsmat.graph.entries.extent(0)); - - KokkosKernels::Impl::kk_sort_graph< - const_slow_row_map_view_t, const_slow_cols_view_t, const_slow_values_view_t, slow_cols_view_t, slow_values_view_t, myExecSpace>( - c_slow_crsmat.graph.row_map, - c_slow_crsmat.graph.entries, - c_slow_crsmat.values, sorted_adj, sorted_vals); + KokkosKernels::sort_crs_matrix(c_slow_crsmat); KokkosKernels::Impl::write_graph_bin( (lno_t) c_slow_crsmat.numRows(), (size_type) c_slow_crsmat.graph.entries.extent(0), c_slow_crsmat.graph.row_map.data(), - sorted_adj.data(), - sorted_vals.data(), - c_mat_file); + c_slow_crsmat.graph.entries.data(), + c_slow_crsmat.values.data(), + c_mat_file); } } } diff --git a/perf_test/sparse/KokkosSparse_run_spgemm.hpp b/perf_test/sparse/KokkosSparse_run_spgemm.hpp index decb2cf07c..d04eb7104d 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm.hpp @@ -45,7 +45,7 @@ #include "KokkosSparse_spgemm.hpp" #include "KokkosKernels_TestParameters.hpp" - +#include "KokkosKernels_Sorting.hpp" #define TRANPOSEFIRST false #define TRANPOSESECOND false @@ -69,24 +69,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){ size_t nentries2 = output_mat2.graph.entries.extent(0) ; size_t nvals2 = output_mat2.values.extent(0); - - lno_nnz_view_t h_ent1 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries1); - scalar_view_t h_vals1 (Kokkos::ViewAllocateWithoutInitializing("v1"), nvals1); - - - KokkosKernels::Impl::kk_sort_graph( - output_mat1.graph.row_map, output_mat1.graph.entries, output_mat1.values, - h_ent1, h_vals1 - ); - - lno_nnz_view_t h_ent2 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries2); - scalar_view_t h_vals2 (Kokkos::ViewAllocateWithoutInitializing("v1"), nvals2); + KokkosKernels::sort_crs_matrix(output_mat1); if (nrows1 != nrows2) { std::cerr <<"row count is different" 
<< std::endl; @@ -101,17 +84,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){ return false; } - KokkosKernels::Impl::kk_sort_graph - ( - output_mat2.graph.row_map, output_mat2.graph.entries, output_mat2.values, - h_ent2, h_vals2 - ); + KokkosKernels::sort_crs_matrix(output_mat2); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view @@ -124,21 +97,23 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){ is_identical = KokkosKernels::Impl::kk_is_identical_view (h_ent1, h_ent2, 0 ); + typename device::execution_space>(output_mat1.graph.entries, output_mat2.graph.entries, 0 ); if (!is_identical) { for (size_t i = 0; i < nrows1; ++i){ size_t rb = output_mat1.graph.row_map[i]; size_t re = output_mat1.graph.row_map[i + 1]; bool incorrect =false; for (size_t j = rb; j < re; ++j){ - if (h_ent1[j] != h_ent2[j]){ + if (output_mat1.graph.entries[j] != output_mat2.graph.entries[j]){ incorrect = true; break; } } if (incorrect){ for (size_t j = rb; j < re; ++j){ - std::cerr << "row:" << i << " j:" << j << " h_ent1[j]:" << h_ent1[j] << " h_ent2[j]:" << h_ent2[j] << " rb:" << rb << " re:" << re<< std::endl; + std::cerr << "row:" << i << " j:" << j << + " h_ent1[j]:" << output_mat1.graph.entries(j) << " h_ent2[j]:" << output_mat2.graph.entries[j] << + " rb:" << rb << " re:" << re << std::endl; } } @@ -149,7 +124,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){ is_identical = KokkosKernels::Impl::kk_is_identical_view (h_vals1, h_vals2, 0.000001); + typename device::execution_space>(output_mat1.values, output_mat2.values, 0.000001); if (!is_identical) { std::cerr << "Incorret values" << std::endl; } diff --git a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp index 14a10d5dd9..8e50f3e879 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp @@ -45,6 +45,7 @@ #include 
"KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_TestParameters.hpp" #include "KokkosSparse_spgemm.hpp" +#include "KokkosKernels_Sorting.hpp" #define TRANSPOSEFIRST false #define TRANSPOSESECOND false @@ -69,16 +70,7 @@ namespace KokkosKernels{ size_t nentries2 = output_mat2.graph.entries.extent(0); size_t nvals2 = output_mat2.values.extent(0); - lno_nnz_view_t h_ent1 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries1); - scalar_view_t h_vals1 (Kokkos::ViewAllocateWithoutInitializing("v1"), nvals1); - - KokkosKernels::Impl::kk_sort_graph(output_mat1.graph.row_map, output_mat1.graph.entries, output_mat1.values, - h_ent1, h_vals1); - - lno_nnz_view_t h_ent2 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries2); - scalar_view_t h_vals2 (Kokkos::ViewAllocateWithoutInitializing("v1"), nvals2); + KokkosKernels::sort_crs_matrix(output_mat1); if (nrows1 != nrows2) { std::cerr <<"row count is different" << std::endl; @@ -93,10 +85,7 @@ namespace KokkosKernels{ return false; } - KokkosKernels::Impl::kk_sort_graph(output_mat2.graph.row_map, output_mat2.graph.entries, output_mat2.values, - h_ent2, h_vals2); + KokkosKernels::sort_crs_matrix(output_mat2); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view @@ -109,7 +98,7 @@ namespace KokkosKernels{ is_identical = KokkosKernels::Impl::kk_is_identical_view (h_ent1, h_ent2, 0); + typename device::execution_space>(output_mat1.graph.entries, output_mat2.graph.entries, 0); if (!is_identical) { std::cerr << "entries are different" << std::endl; @@ -118,7 +107,7 @@ namespace KokkosKernels{ is_identical = KokkosKernels::Impl::kk_is_identical_view (h_vals1, h_vals2, 0.00001); + typename device::execution_space>(output_mat1.values, output_mat2.values, 0.00001); if (!is_identical) { std::cerr << "values are different" << std::endl; } @@ -505,37 +494,23 @@ namespace KokkosKernels{ if (c_mat_file != NULL){ if (params.c_mem_space == 1){ - fast_cols_view_t sorted_adj("sorted adj", 
c_fast_crsmat.graph.entries.extent(0)); - fast_values_view_t sorted_vals("sorted vals", c_fast_crsmat.graph.entries.extent(0)); - - KokkosKernels::Impl::kk_sort_graph - (c_fast_crsmat.graph.row_map, - c_fast_crsmat.graph.entries, - c_fast_crsmat.values, sorted_adj, sorted_vals); + KokkosKernels::sort_crs_matrix(c_fast_crsmat); KokkosKernels::Impl::write_graph_bin((lno_t) (c_fast_crsmat.numRows()), (size_type) (c_fast_crsmat.graph.entries.extent(0)), c_fast_crsmat.graph.row_map.data(), - sorted_adj.data(), - sorted_vals.data(), + c_fast_crsmat.graph.entries.data(), + c_fast_crsmat.values.data(), c_mat_file); } else { - slow_cols_view_t sorted_adj("sorted adj", c_fast_crsmat.graph.entries.extent(0)); - slow_values_view_t sorted_vals("sorted vals", c_fast_crsmat.graph.entries.extent(0)); - - KokkosKernels::Impl::kk_sort_graph - (c_slow_crsmat.graph.row_map, - c_slow_crsmat.graph.entries, - c_slow_crsmat.values, sorted_adj, sorted_vals); + KokkosKernels::sort_crs_matrix(c_slow_crsmat); KokkosKernels::Impl::write_graph_bin((lno_t) c_slow_crsmat.numRows(), (size_type) c_slow_crsmat.graph.entries.extent(0), c_slow_crsmat.graph.row_map.data(), - sorted_adj.data(), - sorted_vals.data(), + c_slow_crsmat.graph.entries.data(), + c_slow_crsmat.values.data(), c_mat_file); } } diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index be37765594..0bfc1289b8 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -43,15 +43,664 @@ */ #ifndef _KOKKOSKERNELS_SORTING_HPP #define _KOKKOSKERNELS_SORTING_HPP + #include "Kokkos_Core.hpp" -#include "Kokkos_Atomic.hpp" -#include "Kokkos_ArithTraits.hpp" -#include "impl/Kokkos_Timer.hpp" +#include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum +#include "KokkosKernels_ExecSpaceUtils.hpp"//for kk_is_gpu_exec_space #include namespace KokkosKernels { + +namespace Impl { + template + struct DefaultComparator + { + KOKKOS_INLINE_FUNCTION bool 
operator()(const Value lhs, const Value rhs) const + { + return lhs < rhs; + } + }; +} + +// ---------------------------------- +// CRS matrix/graph sorting utilities +// ---------------------------------- + +// The sort_crs* functions sort the adjacent column list for each row into ascending order. + +template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values); + +template +void sort_crs_matrix(const crsMat_t& A); + +template +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); + +// sort_and_merge_matrix produces a new matrix which is equivalent to A but is sorted +// and has no duplicated entries: each (i, j) is unique. Values for duplicated entries are summed. +template +crsMat_t sort_and_merge_matrix(const crsMat_t& A); + +// ---------------------------- +// General device-level sorting +// ---------------------------- + +// Bitonic sort: sorts v according to the comparator object's operator(). +// Default comparator is just operator< for v's element type. +template> +void bitonicSort(View v, const Comparator& comp = Comparator()); + +// -------------------------------------------------------- +// Serial sorting (callable inside any kernel or host code) +// -------------------------------------------------------- + +// Radix sort. Not in-place: requires scratch array 'valuesAux' to be the same size as values. +// ValueType must be an unsigned integer type. +template +KOKKOS_INLINE_FUNCTION void +SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n); + +// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts values[0...n]. 
+template +KOKKOS_INLINE_FUNCTION void +SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n); + +// ------------------------------------------------------------------- +// Team-level parallel sorting (callable inside any TeamPolicy kernel) +// ------------------------------------------------------------------- + +// Comparison based sorting that uses the entire team (described by mem) to sort raw array according to the comparator. +template> +KOKKOS_INLINE_FUNCTION void +TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()); + +// Same as TeamBitonicSort, but also permutes perm[0...n] as it sorts values[0...n]. +template> +KOKKOS_INLINE_FUNCTION void +TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()); + namespace Impl { +template +struct SortCrsMatrixFunctor +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + using team_mem = typename Kokkos::TeamPolicy::member_type; + + SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_) + : rowmap(rowmap_), entries(entries_), values(values_) + { + if(usingRangePol) + { + entriesAux = entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), + entries.extent(0)); + valuesAux = values_t(Kokkos::ViewAllocateWithoutInitializing("Values aux"), + values.extent(0)); + } + //otherwise, aux arrays won't be allocated (sorting in place) + } + + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const + { + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + //Radix sort requires unsigned keys for comparison + using unsigned_lno_t = typename std::make_unsigned::type; + KokkosKernels::SerialRadixSort2( 
(unsigned_lno_t*) entries.data() + rowStart, + (unsigned_lno_t*) entriesAux.data() + rowStart, + values.data() + rowStart, + valuesAux.data() + rowStart, rowNum); + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const + { + size_type i = t.league_rank(); + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + KokkosKernels::TeamBitonicSort2 + (entries.data() + rowStart, values.data() + rowStart, rowNum, t); + } + + rowmap_t rowmap; + entries_t entries; + entries_t entriesAux; + values_t values; + values_t valuesAux; +}; + +template +struct SortCrsGraphFunctor +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using team_mem = typename Kokkos::TeamPolicy::member_type; + + SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_, const entries_t& entries_) + : rowmap(rowmap_), entries(entries_) + { + if(usingRangePol) + { + entriesAux = entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), + entries.extent(0)); + } + //otherwise, aux arrays won't be allocated (sorting in place) + } + + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const + { + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + //Radix sort requires unsigned keys for comparison + using unsigned_lno_t = typename std::make_unsigned::type; + KokkosKernels::SerialRadixSort( + (unsigned_lno_t*) entries.data() + rowStart, + (unsigned_lno_t*) entriesAux.data() + rowStart, + rowNum); + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const + { + size_type i = t.league_rank(); + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + KokkosKernels::TeamBitonicSort + (entries.data() + rowStart, rowNum, t); + } + + rowmap_t rowmap; + entries_t entries; + entries_t entriesAux; +}; + +template +struct MergedRowmapFunctor +{ + using 
size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using c_rowmap_t = typename rowmap_t::const_type; + + //Precondition: entries are sorted within each row + MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, const entries_t& entries_) + : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const + { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if(rowEnd == rowBegin) + { + //Row was empty to begin with + mergedCounts(row) = 0; + return; + } + //Otherwise, the first entry in the row exists + lno_t uniqueEntries = 1; + for(size_type j = rowBegin + 1; j < rowEnd; j++) + { + if(entries(j - 1) != entries(j)) + uniqueEntries++; + } + mergedCounts(row) = uniqueEntries; + lnewNNZ += uniqueEntries; + if(row == lno_t((rowmap.extent(0) - 1) - 1)) + mergedCounts(row + 1) = 0; + } + + rowmap_t mergedCounts; + c_rowmap_t rowmap; + entries_t entries; +}; + +template +struct MatrixMergedEntriesFunctor +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + + //Precondition: entries are sorted within each row + MatrixMergedEntriesFunctor( + const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_, + const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, const values_t& mergedValues_) + : rowmap(rowmap_), entries(entries_), values(values_), + mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_), mergedValues(mergedValues_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const + { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if(rowEnd == rowBegin) + { + //Row was empty to begin with, nothing to do + return; + } + //Otherwise, accumulate the value for each column + scalar_t 
accumVal = values(rowBegin); + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for(size_type j = rowBegin + 1; j < rowEnd; j++) + { + if(accumCol == entries(j)) + { + //accumulate + accumVal += values(j); + } + else + { + //write out and reset + mergedValues(insertPos) = accumVal; + mergedEntries(insertPos) = accumCol; + insertPos++; + accumVal = values(j); + accumCol = entries(j); + } + } + //always left with the last unique entry + mergedValues(insertPos) = accumVal; + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + values_t values; + rowmap_t mergedRowmap; + entries_t mergedEntries; + values_t mergedValues; +}; + +template +struct GraphMergedEntriesFunctor +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + + //Precondition: entries are sorted within each row + GraphMergedEntriesFunctor( + const rowmap_t& rowmap_, const entries_t& entries_, + const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_) + : rowmap(rowmap_), entries(entries_), + mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) + {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const + { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if(rowEnd == rowBegin) + { + //Row was empty to begin with, nothing to do + return; + } + //Otherwise, accumulate the value for each column + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for(size_type j = rowBegin + 1; j < rowEnd; j++) + { + if(accumCol != entries(j)) + { + //write out and reset + mergedEntries(insertPos) = accumCol; + insertPos++; + accumCol = entries(j); + } + } + //always left with the last unique entry + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + rowmap_t mergedRowmap; + entries_t mergedEntries; +}; + +//Functor that sorts a view on one team +template +struct BitonicSingleTeamFunctor 
+{ + BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + TeamBitonicSort(v.data(), v.extent(0), t, comp); + }; + View v; + Comparator comp; +}; + +//Functor that sorts equally sized chunks on each team +template +struct BitonicChunkFunctor +{ + BitonicChunkFunctor(View& v_, const Comparator& comp_, Ordinal chunkSize_) : v(v_), comp(comp_), chunkSize(chunkSize_) {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal chunk = t.league_rank(); + Ordinal chunkStart = chunk * chunkSize; + Ordinal n = chunkSize; + if(chunkStart + n > Ordinal(v.extent(0))) + n = v.extent(0) - chunkStart; + TeamBitonicSort(v.data() + chunkStart, n, t, comp); + }; + View v; + Comparator comp; + Ordinal chunkSize; +}; + +//Functor that does just the first phase (brown) of bitonic sort on equally-sized chunks +template +struct BitonicPhase1Functor +{ + typedef typename View::value_type Value; + BitonicPhase1Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, Ordinal teamsPerBox_) + : v(v_), comp(comp_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal box = t.league_rank() / teamsPerBox; + Ordinal boxStart = boxSize * box; + Ordinal work = boxSize / teamsPerBox / 2; + Ordinal workStart = work * (t.league_rank() % teamsPerBox); + Ordinal workReflect = boxSize - workStart - 1; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [&](const Ordinal i) + { + Ordinal elem1 = boxStart + workStart + i; + Ordinal elem2 = boxStart + workReflect - i; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + }; + View v; + Comparator comp; + Ordinal boxSize; + Ordinal teamsPerBox; +}; + +//Functor that does the second phase (red) of bitonic sort +template +struct 
BitonicPhase2Functor +{ + typedef typename View::value_type Value; + BitonicPhase2Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, Ordinal teamsPerBox_) + : v(v_), comp(comp_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) + {} + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const + { + Ordinal logBoxSize = 1; + while((Ordinal(1) << logBoxSize) < boxSize) + logBoxSize++; + Ordinal box = t.league_rank() / teamsPerBox; + Ordinal boxStart = boxSize * box; + Ordinal work = boxSize / teamsPerBox / 2; + Ordinal workStart = boxStart + work * (t.league_rank() % teamsPerBox); + Ordinal jump = boxSize / 2; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [&](const Ordinal i) + { + Ordinal elem1 = workStart + i; + Ordinal elem2 = workStart + jump + i; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + if(teamsPerBox == 1) + { + //This team can finish phase 2 for all the smaller red boxes that follow, + //since there are no longer cross-team data dependencies + for(Ordinal subLevel = 1; subLevel < logBoxSize; subLevel++) + { + t.team_barrier(); + Ordinal logSubBoxSize = logBoxSize - subLevel; + Ordinal subBoxSize = Ordinal(1) << logSubBoxSize; + Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), + [&](const Ordinal i) + { + Ordinal globalThread = i + t.league_rank() * work; + Ordinal subBox = globalThread >> (logSubBoxSize - 1); + Ordinal subBoxStart = subBox << logSubBoxSize; + Ordinal subBoxOffset = globalThread & ((Ordinal(1) << (logSubBoxSize - 1)) - 1); //i % (subBoxSize / 2) + Ordinal elem1 = subBoxStart + subBoxOffset; + //later phases (pink box): within a block, compare with fixed distance (boxSize / 2) apart + Ordinal elem2 = elem1 + subBoxSize / 2; + if(elem2 < Ordinal(v.extent(0))) + { + if(comp(v(elem2), v(elem1))) + { + Value temp = v(elem1); + v(elem1) = v(elem2); + v(elem2) = temp; + } + } + }); + } + } + }; + View v; + 
Comparator comp; + Ordinal boxSize; + Ordinal teamsPerBox; +}; + +} //namespace Impl + +// Sort a CRS matrix: within each row, sort entries ascending by column. +// At the same time, permute the values. +template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) +{ + using lno_t = typename entries_t::non_const_value_type; + using team_pol = Kokkos::TeamPolicy; + bool useRadix = !Impl::kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if(numRows == 0) + return; + Impl::SortCrsMatrixFunctor + funct(useRadix, rowmap, entries, values); + if(useRadix) + { + Kokkos::parallel_for("sort_crs_matrix", Kokkos::RangePolicy(0, numRows), funct); + } + else + { + //Try to get teamsize to be the smallest power of 2 not less than half the avg entries per row + //TODO (probably important for performance): add thread-level sort also, and use that + //for small avg degree. But this works for now. + lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while(idealTeamSize < avgDeg / 2) + { + idealTeamSize *= 2; + } + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); + Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); + } +} + +template +void sort_crs_matrix(const crsMat_t& A) +{ + //Note: rowmap_t has const values, but that's OK as sorting doesn't modify it + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + using exec_space = typename crsMat_t::execution_space; + //NOTE: the rowmap of a StaticCrsGraph is const-valued, but the + //entries and CrsMatrix values are non-const (so sorting them directly + //is allowed) + sort_crs_matrix + (A.graph.row_map, A.graph.entries, A.values); +} + +// Sort a CRS graph: within 
each row, sort entries ascending by column. +template +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) +{ + using lno_t = typename entries_t::non_const_value_type; + using team_pol = Kokkos::TeamPolicy; + bool useRadix = !Impl::kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if(numRows == 0) + return; + Impl::SortCrsGraphFunctor + funct(useRadix, rowmap, entries); + if(useRadix) + { + Kokkos::parallel_for("sort_crs_graph", Kokkos::RangePolicy(0, numRows), funct); + } + else + { + //Try to get teamsize to be smallest power of 2 greater than or equal to + //half the avg entries per row. 0.5 * #entries is bitonic's parallelism within a row. + //TODO (probably important for performance): add thread-level sort also, and use that + //for small avg degree. But this works for now. + lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while(idealTeamSize < avgDeg / 2) + { + idealTeamSize *= 2; + } + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); + Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); + } +} + +//Sort the rows of matrix, and merge duplicate entries. 
+template +crsMat_t sort_and_merge_matrix(const crsMat_t& A) +{ + using c_rowmap_t = typename crsMat_t::row_map_type; + using rowmap_t = typename crsMat_t::row_map_type::non_const_type; + using entries_t = typename crsMat_t::index_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + using size_type = typename rowmap_t::non_const_value_type; + using exec_space = typename crsMat_t::execution_space; + using range_t = Kokkos::RangePolicy; + sort_crs_matrix(A); + //Count entries per row into a new rowmap, in terms of merges that can be done + rowmap_t mergedRowmap(Kokkos::ViewAllocateWithoutInitializing("SortedMerged rowmap"), A.numRows() + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, A.numRows()), + Impl::MergedRowmapFunctor(mergedRowmap, A.graph.row_map, A.graph.entries), numCompressedEntries); + //Prefix sum to get rowmap + Impl::kk_exclusive_parallel_prefix_sum(A.numRows() + 1, mergedRowmap); + entries_t mergedEntries("SortedMerged entries", numCompressedEntries); + values_t mergedValues("SortedMerged values", numCompressedEntries); + //Compute merged entries and values + Kokkos::parallel_for(range_t(0, A.numRows()), + Impl::MatrixMergedEntriesFunctor + (A.graph.row_map, A.graph.entries, A.values, + mergedRowmap, mergedEntries, mergedValues)); + //Finally, construct the new compressed matrix + return crsMat_t("SortedMerged", A.numRows(), A.numCols(), numCompressedEntries, + mergedValues, mergedRowmap, mergedEntries); +} + +template +void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out) +{ + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using range_t = Kokkos::RangePolicy; + using const_rowmap_t = typename rowmap_t::const_type; + lno_t numRows = rowmap_in.extent(0); + if(numRows <= 1) + { + //Matrix has zero rows + rowmap_out 
= rowmap_t(); + entries_out = entries_t(); + return; + } + numRows--; + //Sort in place + sort_crs_graph(rowmap_in, entries_in); + //Count entries per row into a new rowmap, in terms of merges that can be done + rowmap_out = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("SortedMerged rowmap"), numRows + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, numRows), + Impl::MergedRowmapFunctor(rowmap_out, rowmap_in, entries_in), numCompressedEntries); + //Prefix sum to get rowmap + Impl::kk_exclusive_parallel_prefix_sum(numRows + 1, rowmap_out); + entries_out = entries_t("SortedMerged entries", numCompressedEntries); + //Compute merged entries and values + Kokkos::parallel_for(range_t(0, numRows), + Impl::GraphMergedEntriesFunctor + (rowmap_in, entries_in, + rowmap_out, entries_out)); +} + +//Version to be called from host on a single array +//Generally ~2x slower than Kokkos::sort() for large arrays (> 50 M elements), +//but faster for smaller arrays. +// +//This is more general than BinSort: bitonic supports any trivially copyable type +//and an arbitrary device-compatible comparison operator (provided through operator() of Comparator) +//If comparator is void, use operator< (which should only be used for primitives) +template +void bitonicSort(View v, const Comparator& comp) +{ + typedef Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + Ordinal n = v.extent(0); + //If n is small, just sort on a single team + if(n <= Ordinal(1) << 12) + { + Kokkos::parallel_for(team_policy(1, Kokkos::AUTO()), + Impl::BitonicSingleTeamFunctor(v, comp)); + } + else + { + Ordinal npot = 1; + while(npot < n) + npot <<= 1; + //Partition the data equally among fixed number of teams + Ordinal chunkSize = 512; + Ordinal numTeams = npot / chunkSize; + //First, sort within teams + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Impl::BitonicChunkFunctor(v, comp, chunkSize)); + for(int teamsPerBox = 2; 
teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) + { + Ordinal boxSize = teamsPerBox * chunkSize; + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Impl::BitonicPhase1Functor(v, comp, boxSize, teamsPerBox)); + for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) + { + Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Impl::BitonicPhase2Functor(v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); + } + } + } +} + //Radix sort for integers, on a single thread within a team. //Pros: few diverging branches, so OK for sorting on a single GPU vector lane. Better on CPU cores. //Con: requires auxiliary storage, and this version only works for integers @@ -235,22 +884,13 @@ SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermTy } } -template -struct DefaultComparator -{ - KOKKOS_INLINE_FUNCTION bool operator()(const Value lhs, const Value rhs) const - { - return lhs < rhs; - } -}; - //Bitonic merge sort (requires only comparison operators and trivially-copyable) //Pros: In-place, plenty of parallelism for GPUs, and memory references are coalesced //Con: O(n log^2(n)) serial time is bad on CPUs //Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter -template> +template KOKKOS_INLINE_FUNCTION void -TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) +TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp) { //Algorithm only works on power-of-two input size only. 
//If n is not a power-of-two, will implicitly pretend @@ -313,9 +953,9 @@ TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Compar } //Sort "values", while applying the same swaps to "perm" -template> +template KOKKOS_INLINE_FUNCTION void -TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) +TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp) { //Algorithm only works on power-of-two input size only. //If n is not a power-of-two, will implicitly pretend @@ -383,191 +1023,21 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember } } -//Functor that sorts a view on one team -template -struct BitonicSingleTeamFunctor -{ - BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const - { - TeamBitonicSort(v.data(), v.extent(0), t, comp); - }; - View v; - Comparator comp; -}; - -//Functor that sorts equally sized chunks on each team -template -struct BitonicChunkFunctor -{ - BitonicChunkFunctor(View& v_, const Comparator& comp_, Ordinal chunkSize_) : v(v_), comp(comp_), chunkSize(chunkSize_) {} - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const - { - Ordinal chunk = t.league_rank(); - Ordinal chunkStart = chunk * chunkSize; - Ordinal n = chunkSize; - if(chunkStart + n > Ordinal(v.extent(0))) - n = v.extent(0) - chunkStart; - TeamBitonicSort(v.data() + chunkStart, n, t, comp); - }; - View v; - Comparator comp; - Ordinal chunkSize; -}; - -//Functor that does just the first phase (brown) of bitonic sort on equally-sized chunks -template -struct BitonicPhase1Functor +//For backward compatibility: keep the public interface accessible in KokkosKernels::Impl:: +namespace Impl { - typedef typename View::value_type Value; - BitonicPhase1Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, 
Ordinal teamsPerBox_) - : v(v_), comp(comp_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) - {} - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const - { - Ordinal box = t.league_rank() / teamsPerBox; - Ordinal boxStart = boxSize * box; - Ordinal work = boxSize / teamsPerBox / 2; - Ordinal workStart = work * (t.league_rank() % teamsPerBox); - Ordinal workReflect = boxSize - workStart - 1; - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [&](const Ordinal i) - { - Ordinal elem1 = boxStart + workStart + i; - Ordinal elem2 = boxStart + workReflect - i; - if(elem2 < Ordinal(v.extent(0))) - { - if(comp(v(elem2), v(elem1))) - { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); - }; - View v; - Comparator comp; - Ordinal boxSize; - Ordinal teamsPerBox; -}; - -//Functor that does the second phase (red) of bitonic sort -template -struct BitonicPhase2Functor -{ - typedef typename View::value_type Value; - BitonicPhase2Functor(View& v_, const Comparator& comp_, Ordinal boxSize_, Ordinal teamsPerBox_) - : v(v_), comp(comp_), boxSize(boxSize_), teamsPerBox(teamsPerBox_) - {} - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const - { - Ordinal logBoxSize = 1; - while((Ordinal(1) << logBoxSize) < boxSize) - logBoxSize++; - Ordinal box = t.league_rank() / teamsPerBox; - Ordinal boxStart = boxSize * box; - Ordinal work = boxSize / teamsPerBox / 2; - Ordinal workStart = boxStart + work * (t.league_rank() % teamsPerBox); - Ordinal jump = boxSize / 2; - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [&](const Ordinal i) - { - Ordinal elem1 = workStart + i; - Ordinal elem2 = workStart + jump + i; - if(elem2 < Ordinal(v.extent(0))) - { - if(comp(v(elem2), v(elem1))) - { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); - if(teamsPerBox == 1) - { - //This team can finish phase 2 for all the smaller red boxes that follow, - //since there are no longer cross-team data dependencies - 
for(Ordinal subLevel = 1; subLevel < logBoxSize; subLevel++) - { - t.team_barrier(); - Ordinal logSubBoxSize = logBoxSize - subLevel; - Ordinal subBoxSize = Ordinal(1) << logSubBoxSize; - Kokkos::parallel_for(Kokkos::TeamThreadRange(t, work), - [&](const Ordinal i) - { - Ordinal globalThread = i + t.league_rank() * work; - Ordinal subBox = globalThread >> (logSubBoxSize - 1); - Ordinal subBoxStart = subBox << logSubBoxSize; - Ordinal subBoxOffset = globalThread & ((Ordinal(1) << (logSubBoxSize - 1)) - 1); //i % (subBoxSize / 2) - Ordinal elem1 = subBoxStart + subBoxOffset; - //later phases (pink box): within a block, compare with fixed distance (boxSize / 2) apart - Ordinal elem2 = elem1 + subBoxSize / 2; - if(elem2 < Ordinal(v.extent(0))) - { - if(comp(v(elem2), v(elem1))) - { - Value temp = v(elem1); - v(elem1) = v(elem2); - v(elem2) = temp; - } - } - }); - } - } - }; - View v; - Comparator comp; - Ordinal boxSize; - Ordinal teamsPerBox; -}; - -//Version to be called from host on a single array -//Generally ~2x slower than Kokkos::sort() for large arrays (> 50 M elements), -//but faster for smaller arrays. 
-// -//This is more general than BinSort: bitonic supports any trivially copyable type -//and an arbitrary device-compatible comparison operator (provided through operator() of Comparator) -//If comparator is void, use operator< (which should only be used for primitives) -template> -void bitonicSort(View v, const Comparator& comp = Comparator()) -{ - typedef Kokkos::TeamPolicy team_policy; - typedef typename team_policy::member_type team_member; - Ordinal n = v.extent(0); - //If n is small, just sort on a single team - if(n <= Ordinal(1) << 12) - { - Kokkos::parallel_for(team_policy(1, Kokkos::AUTO()), - BitonicSingleTeamFunctor(v, comp)); - } - else - { - Ordinal npot = 1; - while(npot < n) - npot <<= 1; - //Partition the data equally among fixed number of teams - Ordinal chunkSize = 512; - Ordinal numTeams = npot / chunkSize; - //First, sort within teams - Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicChunkFunctor(v, comp, chunkSize)); - for(int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) - { - Ordinal boxSize = teamsPerBox * chunkSize; - Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicPhase1Functor(v, comp, boxSize, teamsPerBox)); - for(int boxDiv = 1; teamsPerBox >> boxDiv; boxDiv++) - { - Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), - BitonicPhase2Functor(v, comp, boxSize >> boxDiv, teamsPerBox >> boxDiv)); - } - } - } + using KokkosKernels::sort_crs_graph; + using KokkosKernels::sort_crs_matrix; + using KokkosKernels::sort_and_merge_graph; + using KokkosKernels::sort_and_merge_matrix; + using KokkosKernels::bitonicSort; + using KokkosKernels::SerialRadixSort; + using KokkosKernels::SerialRadixSort2; + using KokkosKernels::TeamBitonicSort; + using KokkosKernels::TeamBitonicSort2; } -}} +} #endif diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 85763608ec..8d9ef345c5 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ 
b/src/common/KokkosKernels_SparseUtils.hpp @@ -51,7 +51,6 @@ #include "KokkosKernels_ExecSpaceUtils.hpp" #include #include "KokkosKernels_PrintUtils.hpp" -#include "KokkosKernels_Sorting.hpp" #ifdef KOKKOSKERNELS_HAVE_PARALLEL_GNUSORT #include @@ -936,453 +935,6 @@ void graph_min_max_degree(const rowmap_t& rowmap, ordinal_t& min_degree, ordinal max_degree = result.max_val; } -template -struct SortCrsMatrixFunctor -{ - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using scalar_t = typename values_t::non_const_value_type; - using team_mem = typename Kokkos::TeamPolicy::member_type; - - SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_) - : rowmap(rowmap_), entries(entries_), values(values_) - { - if(usingRangePol) - { - entriesAux = entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), - entries.extent(0)); - valuesAux = values_t(Kokkos::ViewAllocateWithoutInitializing("Values aux"), - values.extent(0)); - } - //otherwise, aux arrays won't be allocated (sorting in place) - } - - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const - { - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - //Radix sort requires unsigned keys for comparison - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::Impl::SerialRadixSort2( - (unsigned_lno_t*) entries.data() + rowStart, - (unsigned_lno_t*) entriesAux.data() + rowStart, - values.data() + rowStart, - valuesAux.data() + rowStart, rowNum); - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const - { - size_type i = t.league_rank(); - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - KokkosKernels::Impl::TeamBitonicSort2 - (entries.data() + rowStart, values.data() + rowStart, rowNum, t); - } - - rowmap_t rowmap; - entries_t 
entries; - entries_t entriesAux; - values_t values; - values_t valuesAux; -}; - -template -struct SortCrsGraphFunctor -{ - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using team_mem = typename Kokkos::TeamPolicy::member_type; - - SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_, const entries_t& entries_) - : rowmap(rowmap_), entries(entries_) - { - if(usingRangePol) - { - entriesAux = entries_t(Kokkos::ViewAllocateWithoutInitializing("Entries aux"), - entries.extent(0)); - } - //otherwise, aux arrays won't be allocated (sorting in place) - } - - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const - { - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - //Radix sort requires unsigned keys for comparison - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::Impl::SerialRadixSort( - (unsigned_lno_t*) entries.data() + rowStart, - (unsigned_lno_t*) entriesAux.data() + rowStart, - rowNum); - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const - { - size_type i = t.league_rank(); - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - KokkosKernels::Impl::TeamBitonicSort - (entries.data() + rowStart, rowNum, t); - } - - rowmap_t rowmap; - entries_t entries; - entries_t entriesAux; -}; - -// Sort a CRS matrix: within each row, sort entries ascending by column. -// At the same time, permute the values. -template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) -{ - using lno_t = typename entries_t::non_const_value_type; - using team_pol = Kokkos::TeamPolicy; - bool useRadix = !kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; - if(numRows == 0) - return; - SortCrsMatrixFunctor - funct(useRadix, rowmap, entries, values); - if(useRadix) - { - Kokkos::parallel_for("sort_crs_matrix", Kokkos::RangePolicy(0, numRows), funct); - } - else - { - //Try to get teamsize to be largest power of 2 not greater than avg entries per row - //TODO (probably important for performnce): add thread-level sort also, and use that - //for small avg degree. But this works for now. - lno_t idealTeamSize = 1; - lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while(idealTeamSize < avgDeg / 2) - { - idealTeamSize *= 2; - } - team_pol temp(numRows, 1); - lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); - lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); - } -} - -template -void sort_crs_matrix(const crsMat_t& A) -{ - //Note: rowmap_t has const values, but that's OK as sorting doesn't modify it - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using exec_space = typename crsMat_t::execution_space; - //NOTE: the rowmap of a StaticCrsGraph is const-valued, but the - //entries and CrsMatrix values are non-const (so sorting them directly - //is allowed) - sort_crs_matrix - (A.graph.row_map, A.graph.entries, A.values); -} - -// Sort a CRS graph: within each row, sort entries ascending by column. -template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) -{ - using lno_t = typename entries_t::non_const_value_type; - using team_pol = Kokkos::TeamPolicy; - bool useRadix = !kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; - if(numRows == 0) - return; - SortCrsGraphFunctor - funct(useRadix, rowmap, entries); - if(useRadix) - { - Kokkos::parallel_for("sort_crs_graph", Kokkos::RangePolicy(0, numRows), funct); - } - else - { - //Try to get teamsize to be largest power of 2 less than or equal to - //half the entries per row. 0.5 * #entries is bitonic's parallelism within a row. - //TODO (probably important for performnce): add thread-level sort also, and use that - //for small avg degree. But this works for now. - lno_t idealTeamSize = 1; - lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while(idealTeamSize < avgDeg / 2) - { - idealTeamSize *= 2; - } - team_pol temp(numRows, 1); - lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); - lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); - } -} - -template -struct MergedRowmapFunctor -{ - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using c_rowmap_t = typename rowmap_t::const_type; - - //Precondition: entries are sorted within each row - MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, const entries_t& entries_) - : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const - { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if(rowEnd == rowBegin) - { - //Row was empty to begin with - mergedCounts(row) = 0; - return; - } - //Otherwise, the first entry in the row exists - lno_t uniqueEntries = 1; - for(size_type j = rowBegin + 1; j < rowEnd; j++) - { - if(entries(j - 1) != entries(j)) - uniqueEntries++; - } - mergedCounts(row) = uniqueEntries; - lnewNNZ += uniqueEntries; - if(row == lno_t((rowmap.extent(0) - 1) - 1)) - mergedCounts(row + 1) = 0; - } - - rowmap_t mergedCounts; - 
c_rowmap_t rowmap; - entries_t entries; -}; - -template -struct MatrixMergedEntriesFunctor -{ - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using scalar_t = typename values_t::non_const_value_type; - - //Precondition: entries are sorted within each row - MatrixMergedEntriesFunctor( - const rowmap_t& rowmap_, const entries_t& entries_, const values_t& values_, - const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, const values_t& mergedValues_) - : rowmap(rowmap_), entries(entries_), values(values_), - mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_), mergedValues(mergedValues_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const - { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if(rowEnd == rowBegin) - { - //Row was empty to begin with, nothing to do - return; - } - //Otherwise, accumulate the value for each column - scalar_t accumVal = values(rowBegin); - lno_t accumCol = entries(rowBegin); - size_type insertPos = mergedRowmap(row); - for(size_type j = rowBegin + 1; j < rowEnd; j++) - { - if(accumCol == entries(j)) - { - //accumulate - accumVal += values(j); - } - else - { - //write out and reset - mergedValues(insertPos) = accumVal; - mergedEntries(insertPos) = accumCol; - insertPos++; - accumVal = values(j); - accumCol = entries(j); - } - } - //always left with the last unique entry - mergedValues(insertPos) = accumVal; - mergedEntries(insertPos) = accumCol; - } - - rowmap_t rowmap; - entries_t entries; - values_t values; - rowmap_t mergedRowmap; - entries_t mergedEntries; - values_t mergedValues; -}; - -template -struct GraphMergedEntriesFunctor -{ - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - - //Precondition: entries are sorted within each row - GraphMergedEntriesFunctor( - const rowmap_t& rowmap_, const entries_t& entries_, - const rowmap_t& 
mergedRowmap_, const entries_t& mergedEntries_) - : rowmap(rowmap_), entries(entries_), - mergedRowmap(mergedRowmap_), mergedEntries(mergedEntries_) - {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const - { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if(rowEnd == rowBegin) - { - //Row was empty to begin with, nothing to do - return; - } - //Otherwise, accumulate the value for each column - lno_t accumCol = entries(rowBegin); - size_type insertPos = mergedRowmap(row); - for(size_type j = rowBegin + 1; j < rowEnd; j++) - { - if(accumCol != entries(j)) - { - //write out and reset - mergedEntries(insertPos) = accumCol; - insertPos++; - accumCol = entries(j); - } - } - //always left with the last unique entry - mergedEntries(insertPos) = accumCol; - } - - rowmap_t rowmap; - entries_t entries; - rowmap_t mergedRowmap; - entries_t mergedEntries; -}; - -//Sort the rows of matrix, and merge duplicate entries. -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A) -{ - using c_rowmap_t = typename crsMat_t::row_map_type; - using rowmap_t = typename crsMat_t::row_map_type::non_const_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using size_type = typename rowmap_t::non_const_value_type; - using exec_space = typename crsMat_t::execution_space; - using range_t = Kokkos::RangePolicy; - sort_crs_matrix(A); - //Count entries per row into a new rowmap, in terms of merges that can be done - rowmap_t mergedRowmap(Kokkos::ViewAllocateWithoutInitializing("SortedMerged rowmap"), A.numRows() + 1); - size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, A.numRows()), - MergedRowmapFunctor(mergedRowmap, A.graph.row_map, A.graph.entries), numCompressedEntries); - //Prefix sum to get rowmap - kk_exclusive_parallel_prefix_sum(A.numRows() + 1, mergedRowmap); - entries_t mergedEntries("SortedMerged entries", numCompressedEntries); - 
values_t mergedValues("SortedMerged values", numCompressedEntries); - //Compute merged entries and values - Kokkos::parallel_for(range_t(0, A.numRows()), - MatrixMergedEntriesFunctor - (A.graph.row_map, A.graph.entries, A.values, - mergedRowmap, mergedEntries, mergedValues)); - //Finally, construct the new compressed matrix - return crsMat_t("SortedMerged", A.numRows(), A.numCols(), numCompressedEntries, - mergedValues, mergedRowmap, mergedEntries); -} - -template -void sort_and_merge_graph( - const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, - rowmap_t& rowmap_out, entries_t& entries_out) -{ - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using range_t = Kokkos::RangePolicy; - using const_rowmap_t = typename rowmap_t::const_type; - lno_t numRows = rowmap_in.extent(0); - if(numRows <= 1) - { - //Matrix has zero rows - rowmap_out = rowmap_t(); - entries_out = entries_t(); - return; - } - numRows--; - //Sort in place - sort_crs_graph(rowmap_in, entries_in); - //Count entries per row into a new rowmap, in terms of merges that can be done - rowmap_out = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("SortedMerged rowmap"), numRows + 1); - size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, numRows), - MergedRowmapFunctor(rowmap_out, rowmap_in, entries_in), numCompressedEntries); - //Prefix sum to get rowmap - kk_exclusive_parallel_prefix_sum(numRows + 1, rowmap_out); - entries_out = entries_t("SortedMerged entries", numCompressedEntries); - //Compute merged entries and values - Kokkos::parallel_for(range_t(0, numRows), - GraphMergedEntriesFunctor - (rowmap_in, entries_in, - rowmap_out, entries_out)); -} - -template -void kk_sort_graph( - lno_view_t in_xadj, - lno_nnz_view_t in_adj, - scalar_view_t in_vals, - - out_nnz_view_t out_adj, - out_scalar_view_t out_vals){ - // TODO BMK: can this function be deprecated? 
- typename lno_view_t::HostMirror hr = Kokkos::create_mirror_view (in_xadj); - Kokkos::deep_copy (hr, in_xadj); - typename lno_nnz_view_t::HostMirror he = Kokkos::create_mirror_view (in_adj); - Kokkos::deep_copy (he, in_adj); - typename scalar_view_t::HostMirror hv = Kokkos::create_mirror_view (in_vals); - Kokkos::deep_copy (hv, in_vals); - MyExecSpace().fence(); - - typename lno_nnz_view_t::HostMirror heo = Kokkos::create_mirror_view (out_adj); - typename scalar_view_t::HostMirror hvo = Kokkos::create_mirror_view (out_vals); - - typedef typename lno_view_t::non_const_value_type size_type; - typedef typename lno_nnz_view_t::non_const_value_type lno_t; - typedef typename scalar_view_t::non_const_value_type scalar_t; - - lno_t nrows = in_xadj.extent(0) - 1; - std::vector > edges(in_adj.extent(0)); - - size_type row_size = 0; - for (lno_t i = 0; i < nrows; ++i){ - for (size_type j = hr(i); j < hr(i + 1); ++j){ - edges[row_size].src = i; - edges[row_size].dst = he(j); - edges[row_size++].ew = hv(j); - } - } - std::sort (edges.begin(), edges.begin() + row_size); - size_type ne = in_adj.extent(0); - for(size_type i = 0; i < ne; ++i){ - heo(i) = edges[i].dst; - hvo(i) = edges[i].ew; - } - - - Kokkos::deep_copy (out_adj, heo); - Kokkos::deep_copy (out_vals, hvo); - MyExecSpace().fence(); -} - /* template -void kk_create_incidence_matrix_from_lower_triangle( - typename cols_view_t::non_const_value_type nr, - row_map_view_t in_lower_rowmap, - cols_view_t in_lower_entries, - out_row_map_view_t &out_rowmap, - out_cols_view_t &out_entries, - bool use_dynamic_scheduling = false, - bool chunksize = 4){ - - //typedef typename row_map_view_t::const_type const_row_map_view_t; - //typedef typename cols_view_t::const_type const_cols_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - - - - //const size_type *rowmap = in_rowmap.data(); - //const lno_t *entries= in_entries.data(); - const size_type ne 
= in_lower_entries.extent(0); - typedef Kokkos::RangePolicy my_exec_space; - out_rowmap = out_row_map_view_t("LL", nr+1); - - Kokkos::parallel_for(my_exec_space(0, ne), - KOKKOS_LAMBDA(const lno_t& i) { - typedef typename std::remove_reference< decltype( out_rowmap[0] ) >::type atomic_incr_type; - Kokkos::atomic_fetch_add(&(out_rowmap[in_lower_entries[i]]), atomic_incr_type(1)); - }); - - exec_space().fence(); - kk_exclusive_parallel_prefix_sum(nr+1, out_rowmap); - - exec_space().fence(); - Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromLowerTriangle::S0", my_exec_space(0, nr + 1), - KOKKOS_LAMBDA(const lno_t& i) { - out_rowmap[i] += in_lower_rowmap[i]; - }); - - out_row_map_view_t out_rowmap_copy (Kokkos::ViewAllocateWithoutInitializing("tmp"), nr+1); - Kokkos::deep_copy(out_rowmap_copy, out_rowmap); - - out_entries = out_cols_view_t(Kokkos::ViewAllocateWithoutInitializing("LL"), 2*ne); - - Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromLowerTriangle::S1", my_exec_space(0, nr), - KOKKOS_LAMBDA(const size_type& row) { - size_type begin = in_lower_rowmap(row); - lno_t row_size = in_lower_rowmap(row + 1) - begin; - - - - size_type out_begin = out_rowmap_copy(row); - out_rowmap_copy(row) += row_size; - //lno_t row_size = out_rowmap_copy(row + 1) - begin; - - for (int i = 0; i < row_size; ++i){ - size_type edge_ind = i + begin; - out_entries[out_begin + i] = edge_ind; - } - }); - exec_space().fence(); - Kokkos::parallel_for("KokkosKernels::Common::CreateIncidenceTransposeMatrixFromLowerTriangle::S2", my_exec_space(0, ne), - KOKKOS_LAMBDA(const size_type& edge_ind) { - lno_t col = in_lower_entries[edge_ind]; - typedef typename std::remove_reference< decltype( out_rowmap_copy(0) ) >::type atomic_incr_type; - size_type write_ind = Kokkos::atomic_fetch_add(&(out_rowmap_copy(col)), atomic_incr_type(1)); - out_entries[write_ind] = edge_ind; - }); - - out_cols_view_t tmp ("kk", ne * 2); - out_cols_view_t outcols 
("kk", ne * 2); - - kk_sort_graph - (out_rowmap, out_entries, - tmp, - - outcols, - tmp); - - out_entries = outcols; -} - - template - (out_rowmap, out_entries, - tmp, - - outcols, - tmp); - - out_entries = outcols;*/ - } +} diff --git a/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/src/graph/KokkosGraph_ExplicitCoarsening.hpp index def892a167..c380cadc64 100644 --- a/src/graph/KokkosGraph_ExplicitCoarsening.hpp +++ b/src/graph/KokkosGraph_ExplicitCoarsening.hpp @@ -46,7 +46,7 @@ #define KOKKOSGRAPH_EXPLICIT_COARSEN_HPP #include "KokkosGraph_ExplicitCoarsening_impl.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosKernels_Sorting.hpp" namespace KokkosGraph { namespace Experimental { @@ -78,7 +78,7 @@ void graph_explicit_coarsen( { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosKernels::Impl::sort_and_merge_graph + KokkosKernels::sort_and_merge_graph (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; @@ -109,7 +109,7 @@ void graph_explicit_coarsen_with_inverse_map( { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosKernels::Impl::sort_and_merge_graph + KokkosKernels::sort_and_merge_graph (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 1efae2c1a7..537aa779f6 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -377,7 +377,7 @@ void spadd_symbolic( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", range_type(0, nrows), unmergedSum); // sort the unmerged sum - KokkosKernels::Impl::sort_crs_matrix( c_rowmap_upperbound, c_entries_uncompressed, ab_perm); clno_nnz_view_t_ a_pos(NoInitialize("A entry positions"), diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 
d5c111862f..9ab7140725 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -50,12 +50,12 @@ #include #include #include -#include #include #include "KokkosGraph_Distance1Color.hpp" #include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" +#include "KokkosKernels_Sorting.hpp" //FOR DEBUGGING #include "KokkosBlas1_nrm2.hpp" @@ -738,7 +738,7 @@ namespace KokkosSparse{ // TODO BMK: Why are the vertices in each color set only being sorted on GPU? // Wouldn't it have a locality benefit on CPU too? if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { - KokkosKernels::Impl::sort_crs_graph(color_xadj, color_adj); + KokkosKernels::sort_crs_graph(color_xadj, color_adj); MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "SORT_TIME:" << timer.seconds() << std::endl; diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 7b2db4ba9a..4682bb50c0 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -845,9 +845,9 @@ namespace KokkosSparse{ if (sptrsv_algo == SPTRSVAlgorithm::SPTRSV_CUSPARSE) { // symbolic with CuSparse needs values // CuSparse needs matrix sorted by column indexes for each row // TODO: may need to move this to symbolic/numeric of sptrsv - KokkosKernels::Impl::sort_crs_matrix + KokkosKernels::sort_crs_matrix (rowmap_viewL, column_viewL, values_viewL); - KokkosKernels::Impl::sort_crs_matrix + KokkosKernels::sort_crs_matrix (rowmap_viewU, column_viewU, values_viewU); // now do symbolic diff --git a/test_common/KokkosKernels_MatrixConverter.cpp b/test_common/KokkosKernels_MatrixConverter.cpp index 41fb5ebc2c..4148452fa8 100644 --- a/test_common/KokkosKernels_MatrixConverter.cpp +++ 
b/test_common/KokkosKernels_MatrixConverter.cpp @@ -45,6 +45,7 @@ #include #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_Utils.hpp" +#include "KokkosKernels_Sorting.hpp" #include #include "KokkosSparse_CrsMatrix.hpp" @@ -223,13 +224,8 @@ int main (int argc, char* argv[]){ (numrows, orm, oentries, new_rowmap, new_entries); values_view_t new_values("new_values",new_entries.extent(0)); - cols_view_t out_adj ("out_adj", new_entries.extent(0)); - values_view_t out_vals("out_vals", new_entries.extent(0)); - - KokkosKernels::Impl::kk_sort_graph - (new_rowmap, new_entries, new_values, out_adj, out_vals); - new_entries = out_adj; - new_values = out_vals; + KokkosKernels::sort_crs_matrix + (new_rowmap, new_entries, new_values); graph_t symmetric_graph(new_entries, new_rowmap); crstmat_t symmetric_marix("transpose", numrows, new_values, symmetric_graph); @@ -260,17 +256,15 @@ int main (int argc, char* argv[]){ new_rowmap, new_entries, new_values); std::cout << 1 << std::endl; - cols_view_t out_adj ("out_adj", new_entries.extent(0)); - values_view_t out_vals("out_vals", new_entries.extent(0)); std::cout << 2 << std::endl; - KokkosKernels::Impl::kk_sort_graph - (new_rowmap, new_entries, new_values, out_adj, out_vals); - new_entries = out_adj; - new_values = out_vals; + + KokkosKernels::sort_crs_matrix + (new_rowmap, new_entries, new_values); + std::cout << 3 << std::endl; MyExecSpace().fence(); - KokkosKernels::Impl::kk_print_1Dview(out_adj); - KokkosKernels::Impl::kk_print_1Dview(out_vals); + KokkosKernels::Impl::kk_print_1Dview(new_entries); + KokkosKernels::Impl::kk_print_1Dview(new_values); graph_t transpose_graph(new_entries, new_rowmap); crstmat_t transpose_matrix("transpose", a_crsmat.numRows(), new_values, transpose_graph); diff --git a/unit_test/common/Test_Common_Sorting.hpp b/unit_test/common/Test_Common_Sorting.hpp index 732ee4b451..877950f681 100644 --- a/unit_test/common/Test_Common_Sorting.hpp +++ b/unit_test/common/Test_Common_Sorting.hpp @@ 
-603,13 +603,13 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type //call the actual sort routine being tested if(doValues) { - KokkosKernels::Impl::sort_crs_matrix + KokkosKernels::sort_crs_matrix (A.graph.row_map, A.graph.entries, A.values); } else { - KokkosKernels::Impl::sort_crs_graph + KokkosKernels::sort_crs_graph (A.graph.row_map, A.graph.entries); } @@ -674,7 +674,7 @@ void testSortAndMerge() Kokkos::deep_copy(devInValues, hostInValues); crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap, devInEntries); - crsMat_t output = KokkosKernels::Impl::sort_and_merge_matrix(input); + crsMat_t output = KokkosKernels::sort_and_merge_matrix(input); exec_space().fence(); EXPECT_EQ(output.numRows(), nrows); EXPECT_EQ(output.numCols(), ncols); diff --git a/unit_test/common/Test_Common_Transpose.hpp b/unit_test/common/Test_Common_Transpose.hpp index 9319afd6fe..905d466adb 100644 --- a/unit_test/common/Test_Common_Transpose.hpp +++ b/unit_test/common/Test_Common_Transpose.hpp @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -137,9 +138,8 @@ void testTranspose(int numRows, int numCols, bool doValues) tt_rowmap, tt_entries); } //Sort both the transpose-transpose, and the original matrix (to compare directly) - KokkosKernels::Impl::sort_crs_matrix - (input_mat.graph.row_map, input_mat.graph.entries, input_mat.values); - KokkosKernels::Impl::sort_crs_matrix + KokkosKernels::sort_crs_matrix(input_mat); + KokkosKernels::sort_crs_matrix (tt_rowmap, tt_entries, tt_values); //The views should now be exactly identical, since they represent the same matrix and are sorted size_type rowmapDiffs; diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index c0776f17e7..96f29957aa 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -47,6 +47,7 @@ #include #include "KokkosKernels_SparseUtils.hpp" +#include 
"KokkosKernels_Sorting.hpp" #include #include #include @@ -202,27 +203,6 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference){ size_t nentries_reference = output_mat_reference.graph.entries.extent(0) ; size_t nvals_reference = output_mat_reference.values.extent(0); - - lno_nnz_view_t h_ent_actual (Kokkos::ViewAllocateWithoutInitializing("h_ent_actual"), nentries_actual); - scalar_view_t h_vals_actual (Kokkos::ViewAllocateWithoutInitializing("h_vals_actual"), nvals_actual); - - - KokkosKernels::Impl::kk_sort_graph( - output_mat_actual.graph.row_map, - output_mat_actual.graph.entries, - output_mat_actual.values, - h_ent_actual, h_vals_actual - ); - - lno_nnz_view_t h_ent_reference (Kokkos::ViewAllocateWithoutInitializing("h_ent_reference"), nentries_reference); - scalar_view_t h_vals_reference (Kokkos::ViewAllocateWithoutInitializing("h_vals_reference"), nvals_reference); - if (nrows_actual != nrows_reference) { std::cout << "nrows_actual:" << nrows_actual << " nrows_reference:" << nrows_reference << std::endl; return false; @@ -236,19 +216,8 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference){ return false; } - KokkosKernels::Impl::kk_sort_graph - ( - output_mat_reference.graph.row_map, - output_mat_reference.graph.entries, - output_mat_reference.values, - h_ent_reference, h_vals_reference - ); + KokkosKernels::sort_crs_matrix(output_mat_actual); + KokkosKernels::sort_crs_matrix(output_mat_reference); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view @@ -266,12 +235,12 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference){ is_identical = KokkosKernels::Impl::kk_is_identical_view (h_ent_actual, h_ent_reference, 0 ); + typename device::execution_space>(output_mat_actual.graph.entries, output_mat_reference.graph.entries, 0 ); if (!is_identical) { std::cout << "entries are different." 
<< std::endl; - KokkosKernels::Impl::kk_print_1Dview(h_ent_actual); - KokkosKernels::Impl::kk_print_1Dview(h_ent_reference); + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.entries); + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.entries); return false; } @@ -282,7 +251,7 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference){ is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view (h_vals_actual, h_vals_reference, eps); + typename device::execution_space>(output_mat_actual.values, output_mat_reference.values, eps); if (!is_identical) { std::cout << "values are different." << std::endl; diff --git a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp index 7332fa99cc..7a2652bbb2 100644 --- a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp @@ -47,6 +47,7 @@ #include #include "KokkosKernels_SparseUtils.hpp" +#include "KokkosKernels_Sorting.hpp" #include #include #include @@ -177,24 +178,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2){ size_t nentries2 = output_mat2.graph.entries.extent(0) ; size_t nvals2 = output_mat2.values.extent(0); - - lno_nnz_view_t h_ent1 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries1); - scalar_view_t h_vals1 (Kokkos::ViewAllocateWithoutInitializing("v1"), nvals1); - - - KokkosKernels::Impl::kk_sort_graph( - output_mat1.graph.row_map, output_mat1.graph.entries, output_mat1.values, - h_ent1, h_vals1 - ); - - lno_nnz_view_t h_ent2 (Kokkos::ViewAllocateWithoutInitializing("e1"), nentries2); - scalar_view_t h_vals2 (Kokkos::ViewAllocateWithoutInitializing("v1"), nvals2); + KokkosKernels::sort_crs_matrix(output_mat1); if (nrows1 != nrows2) { std::cout << "nrows1:" << nrows1 << " nrows2:" << nrows2 << std::endl; @@ -209,17 +193,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2){ return false; } - KokkosKernels::Impl::kk_sort_graph - ( - 
output_mat2.graph.row_map, output_mat2.graph.entries, output_mat2.values, - h_ent2, h_vals2 - ); + KokkosKernels::sort_crs_matrix(output_mat2); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view @@ -235,12 +209,12 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2){ is_identical = KokkosKernels::Impl::kk_is_identical_view (h_ent1, h_ent2, 0 ); + typename device::execution_space>(output_mat1.graph.entries, output_mat2.graph.entries, 0 ); if (!is_identical) { std::cout << "entries are different." << std::endl; - KokkosKernels::Impl::kk_print_1Dview(h_ent1); - KokkosKernels::Impl::kk_print_1Dview(h_ent2); + KokkosKernels::Impl::kk_print_1Dview(output_mat1.graph.entries); + KokkosKernels::Impl::kk_print_1Dview(output_mat2.graph.entries); return false; } @@ -250,12 +224,12 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2){ is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view (h_vals1, h_vals2, eps); + typename device::execution_space>(output_mat1.values, output_mat2.values, eps); if (!is_identical) { std::cout << "values are different for eps: " << eps << std::endl; - KokkosKernels::Impl::kk_print_1Dview(h_vals1); - KokkosKernels::Impl::kk_print_1Dview(h_vals2); + KokkosKernels::Impl::kk_print_1Dview(output_mat1.values); + KokkosKernels::Impl::kk_print_1Dview(output_mat2.values); return false; } From b95c7cf94b5ecdf32f425c8dba4d586210ee784a Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 9 Apr 2021 12:33:10 -0600 Subject: [PATCH 2/3] Fix unused typedef warnings --- perf_test/sparse/KokkosSparse_multimem_spgemm.hpp | 15 --------------- .../sparse/KokkosSparse_run_spgemm_jacobi.hpp | 11 ----------- 2 files changed, 26 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp index f31b611795..79fc0dbe87 100644 --- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp +++ b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp @@ 
-58,22 +58,7 @@ namespace Experiment{ typedef Kokkos::Device mySlowExecSpace; typedef typename KokkosSparse::CrsMatrix fast_crstmat_t; - //typedef typename fast_crstmat_t::StaticCrsGraphType fast_graph_t; - //typedef typename fast_crstmat_t::row_map_type::non_const_type fast_row_map_view_t; - typedef typename fast_crstmat_t::index_type::non_const_type fast_cols_view_t; - typedef typename fast_crstmat_t::values_type::non_const_type fast_values_view_t; - typedef typename fast_crstmat_t::row_map_type::const_type const_fast_row_map_view_t; - typedef typename fast_crstmat_t::index_type::const_type const_fast_cols_view_t; - typedef typename fast_crstmat_t::values_type::const_type const_fast_values_view_t; - typedef typename KokkosSparse::CrsMatrix slow_crstmat_t; - //typedef typename slow_crstmat_t::StaticCrsGraphType slow_graph_t; - //typedef typename slow_crstmat_t::row_map_type::non_const_type slow_row_map_view_t; - typedef typename slow_crstmat_t::index_type::non_const_type slow_cols_view_t; - typedef typename slow_crstmat_t::values_type::non_const_type slow_values_view_t; - typedef typename slow_crstmat_t::row_map_type::const_type const_slow_row_map_view_t; - typedef typename slow_crstmat_t::index_type::const_type const_slow_cols_view_t; - typedef typename slow_crstmat_t::values_type::const_type const_slow_values_view_t; char *a_mat_file = params.a_mtx_bin_file; char *b_mat_file = params.b_mtx_bin_file; diff --git a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp index 8e50f3e879..5a8d7f3f13 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp @@ -307,18 +307,7 @@ namespace KokkosKernels{ typedef Kokkos::Device mySlowExecSpace; typedef typename KokkosSparse::CrsMatrix fast_crstmat_t; - typedef typename fast_crstmat_t::index_type::non_const_type fast_cols_view_t; - typedef typename fast_crstmat_t::values_type::non_const_type 
fast_values_view_t; - typedef typename fast_crstmat_t::row_map_type::const_type const_fast_row_map_view_t; - typedef typename fast_crstmat_t::index_type::const_type const_fast_cols_view_t; - typedef typename fast_crstmat_t::values_type::const_type const_fast_values_view_t; - typedef typename KokkosSparse::CrsMatrix slow_crstmat_t; - typedef typename slow_crstmat_t::index_type::non_const_type slow_cols_view_t; - typedef typename slow_crstmat_t::values_type::non_const_type slow_values_view_t; - typedef typename slow_crstmat_t::row_map_type::const_type const_slow_row_map_view_t; - typedef typename slow_crstmat_t::index_type::const_type const_slow_cols_view_t; - typedef typename slow_crstmat_t::values_type::const_type const_slow_values_view_t; char *a_mat_file = params.a_mtx_bin_file; char *b_mat_file = params.b_mtx_bin_file; From 0c5499e5e1bc1a332b47185bb349511ed7099a27 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 23 Apr 2021 10:45:47 -0600 Subject: [PATCH 3/3] Add StaticCrsGraph sorting interfaces and deprecate KokkosKernels::Impl:: sorting functions --- perf_test/sparse/KokkosSparse_run_spgemm.hpp | 8 +- src/common/KokkosKernels_Sorting.hpp | 129 +++++++++++++++++-- unit_test/common/Test_Common_Sorting.hpp | 64 +++++---- 3 files changed, 164 insertions(+), 37 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_run_spgemm.hpp b/perf_test/sparse/KokkosSparse_run_spgemm.hpp index d04eb7104d..87b3761c1e 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm.hpp @@ -100,11 +100,11 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){ typename device::execution_space>(output_mat1.graph.entries, output_mat2.graph.entries, 0 ); if (!is_identical) { for (size_t i = 0; i < nrows1; ++i){ - size_t rb = output_mat1.graph.row_map[i]; - size_t re = output_mat1.graph.row_map[i + 1]; + size_t rb = output_mat1.graph.row_map(i); + size_t re = output_mat1.graph.row_map(i + 1); bool incorrect =false; for 
(size_t j = rb; j < re; ++j){ - if (output_mat1.graph.entries[j] != output_mat2.graph.entries[j]){ + if (output_mat1.graph.entries(j) != output_mat2.graph.entries(j)){ incorrect = true; break; } @@ -112,7 +112,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2){ if (incorrect){ for (size_t j = rb; j < re; ++j){ std::cerr << "row:" << i << " j:" << j << - " h_ent1[j]:" << output_mat1.graph.entries(j) << " h_ent2[j]:" << output_mat2.graph.entries[j] << + " h_ent1(j):" << output_mat1.graph.entries(j) << " h_ent2(j):" << output_mat2.graph.entries(j) << " rb:" << rb << " re:" << re << std::endl; } } diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index 0bfc1289b8..49b5dc3430 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -77,11 +77,22 @@ void sort_crs_matrix(const crsMat_t& A); template void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); +template +void sort_crs_graph(const crsGraph_t& G); + // sort_and_merge_matrix produces a new matrix which is equivalent to A but is sorted // and has no duplicated entries: each (i, j) is unique. Values for duplicated entries are summed. 
template crsMat_t sort_and_merge_matrix(const crsMat_t& A); +template +crsGraph_t sort_and_merge_graph(const crsGraph_t& G); + +template +void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out); + // ---------------------------- // General device-level sorting // ---------------------------- @@ -371,7 +382,7 @@ struct BitonicSingleTeamFunctor BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - TeamBitonicSort(v.data(), v.extent(0), t, comp); + KokkosKernels::TeamBitonicSort(v.data(), v.extent(0), t, comp); }; View v; Comparator comp; @@ -389,7 +400,7 @@ struct BitonicChunkFunctor Ordinal n = chunkSize; if(chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - TeamBitonicSort(v.data() + chunkStart, n, t, comp); + KokkosKernels::TeamBitonicSort(v.data() + chunkStart, n, t, comp); }; View v; Comparator comp; @@ -591,6 +602,17 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) } } +template +void sort_crs_graph(const crsGraph_t& G) +{ + static_assert(!std::is_const::value, + "sort_crs_graph requires StaticCrsGraph entries to be non-const."); + sort_crs_graph< + typename crsGraph_t::execution_space, + typename crsGraph_t::row_map_type, typename crsGraph_t::entries_type> + (G.row_map, G.entries); +} + //Sort the rows of matrix, and merge duplicate entries. 
template crsMat_t sort_and_merge_matrix(const crsMat_t& A) @@ -657,6 +679,19 @@ void sort_and_merge_graph( rowmap_out, entries_out)); } +template +crsGraph_t sort_and_merge_graph(const crsGraph_t& G) +{ + using rowmap_t = typename crsGraph_t::row_map_type::non_const_type; + using entries_t = typename crsGraph_t::entries_type; + static_assert(!std::is_const::value, + "sort_and_merge_graph requires StaticCrsGraph entries to be non-const."); + rowmap_t mergedRowmap; + entries_t mergedEntries; + sort_and_merge_graph(G.row_map, G.entries, mergedRowmap, mergedEntries); + return crsGraph_t(mergedEntries, mergedRowmap); +} + //Version to be called from host on a single array //Generally ~2x slower than Kokkos::sort() for large arrays (> 50 M elements), //but faster for smaller arrays. @@ -1026,15 +1061,87 @@ TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember //For backward compatibility: keep the public interface accessible in KokkosKernels::Impl:: namespace Impl { - using KokkosKernels::sort_crs_graph; - using KokkosKernels::sort_crs_matrix; - using KokkosKernels::sort_and_merge_graph; - using KokkosKernels::sort_and_merge_matrix; - using KokkosKernels::bitonicSort; - using KokkosKernels::SerialRadixSort; - using KokkosKernels::SerialRadixSort2; - using KokkosKernels::TeamBitonicSort; - using KokkosKernels::TeamBitonicSort2; + template + [[deprecated]] + void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) + { + KokkosKernels::sort_crs_graph(rowmap, entries); + } + + template + [[deprecated]] + void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) + { + KokkosKernels::sort_crs_matrix(rowmap, entries, values); + } + + template + [[deprecated]] + void sort_crs_matrix(const crsMat_t& A) + { + KokkosKernels::sort_crs_matrix(A); + } + + template + [[deprecated]] + void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, 
entries_t& entries_out) + { + KokkosKernels::sort_and_merge_graph(rowmap_in, entries_in, rowmap_out, entries_out); + } + + template + [[deprecated]] + crsMat_t sort_and_merge_matrix(const crsMat_t& A) + { + return KokkosKernels::sort_and_merge_matrix(A); + } + + template> + [[deprecated]] + void bitonicSort(View v, const Comparator& comp = Comparator()) + { + KokkosKernels::bitonicSort(v, comp); + } + + template + [[deprecated]] + KOKKOS_INLINE_FUNCTION + void + SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) + { + KokkosKernels::SerialRadixSort(values, valuesAux, n); + } + + // Same as SerialRadixSort, but also permutes perm[0...n] as it sorts values[0...n]. + template + [[deprecated]] + KOKKOS_INLINE_FUNCTION + void + SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, PermType* permAux, Ordinal n) + { + KokkosKernels::SerialRadixSort2(values, valuesAux, perm, permAux, n); + } + + template> + [[deprecated]] + KOKKOS_INLINE_FUNCTION + void + TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) + { + KokkosKernels::TeamBitonicSort(values, n, mem, comp); + } + + // Same as TeamBitonicSort, but also permutes perm[0...n] as it sorts values[0...n].
+ template> + [[deprecated]] + KOKKOS_INLINE_FUNCTION + void + TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) + { + KokkosKernels::TeamBitonicSort2(values, perm, n, mem, comp); + } } } diff --git a/unit_test/common/Test_Common_Sorting.hpp b/unit_test/common/Test_Common_Sorting.hpp index 877950f681..b6e9de00b4 100644 --- a/unit_test/common/Test_Common_Sorting.hpp +++ b/unit_test/common/Test_Common_Sorting.hpp @@ -184,7 +184,7 @@ struct TestSerialRadixFunctor KOKKOS_INLINE_FUNCTION void operator()(const int i) const { int off = offsets(i); - KokkosKernels::Impl::SerialRadixSort( + KokkosKernels::SerialRadixSort( (UnsignedKey*) keys.data() + off, (UnsignedKey*) keysAux.data() + off, counts(i)); } KeyView keys; @@ -207,7 +207,7 @@ struct TestSerialRadix2Functor KOKKOS_INLINE_FUNCTION void operator()(const int i) const { int off = offsets(i); - KokkosKernels::Impl::SerialRadixSort2( + KokkosKernels::SerialRadixSort2( (UnsignedKey*) keys.data() + off, (UnsignedKey*) keysAux.data() + off, values.data() + off, valuesAux.data() + off, counts(i)); } @@ -321,7 +321,7 @@ struct TestTeamBitonicFunctor KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::Impl::TeamBitonicSort(values.data() + offsets(i), counts(i), t); + KokkosKernels::TeamBitonicSort(values.data() + offsets(i), counts(i), t); } ValView values; @@ -343,7 +343,7 @@ struct TestTeamBitonic2Functor KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { int i = t.league_rank(); - KokkosKernels::Impl::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), counts(i), t); + KokkosKernels::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), counts(i), t); } KeyView keys; @@ -458,7 +458,7 @@ void testBitonicSort(size_t n) typedef Kokkos::View ValView; ValView data("Bitonic sort testing data", n); fillRandom(data); - 
KokkosKernels::Impl::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckSortedFunctor(data), Kokkos::Min(ordered)); @@ -501,7 +501,7 @@ void testBitonicSortDescending() size_t n = 12521; ValView data("Bitonic sort testing data", n); fillRandom(data); - KokkosKernels::Impl::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); @@ -536,7 +536,7 @@ void testBitonicSortLexicographic() size_t n = 9521; ValView data("Bitonic sort testing data", n); fillRandom(data); - KokkosKernels::Impl::bitonicSort(data); + KokkosKernels::bitonicSort(data); int ordered = 1; Kokkos::parallel_reduce(Kokkos::RangePolicy(0, n - 1), CheckOrderedFunctor(data), Kokkos::Min(ordered)); @@ -544,7 +544,7 @@ void testBitonicSortLexicographic() } template -void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues) +void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues, bool doStructInterface) { using scalar_t = default_scalar; using lno_t = default_lno_t; @@ -603,15 +603,29 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type //call the actual sort routine being tested if(doValues) { - KokkosKernels::sort_crs_matrix - - (A.graph.row_map, A.graph.entries, A.values); + if(doStructInterface) + { + KokkosKernels::sort_crs_matrix(A); + } + else + { + KokkosKernels::sort_crs_matrix + + (A.graph.row_map, A.graph.entries, A.values); + } } else { - KokkosKernels::sort_crs_graph - - (A.graph.row_map, A.graph.entries); + if(doStructInterface) + { + KokkosKernels::sort_crs_graph(A.graph); + } + else + { + KokkosKernels::sort_crs_graph + + (A.graph.row_map, A.graph.entries); + } } //Copy to host and compare Kokkos::View entriesOut("sorted entries host", nnz); @@ -774,20 +788,26 @@ TEST_F( 
TestCategory, common_device_bitonic) { } TEST_F( TestCategory, common_sort_crsgraph) { - testSortCRS(10, 10, 20, false); - testSortCRS(100, 100, 2000, false); - testSortCRS(1000, 1000, 30000, false); + for(int doStructInterface = 0; doStructInterface < 2; doStructInterface++) + { + testSortCRS(10, 10, 20, false, doStructInterface); + testSortCRS(100, 100, 2000, false, doStructInterface); + testSortCRS(1000, 1000, 30000, false, doStructInterface); + } } TEST_F( TestCategory, common_sort_crsmatrix) { - testSortCRS(10, 10, 20, true); - testSortCRS(100, 100, 2000, true); - testSortCRS(1000, 1000, 30000, true); + for(int doStructInterface = 0; doStructInterface < 2; doStructInterface++) + { + testSortCRS(10, 10, 20, true, doStructInterface); + testSortCRS(100, 100, 2000, true, doStructInterface); + testSortCRS(1000, 1000, 30000, true, doStructInterface); + } } TEST_F( TestCategory, common_sort_crs_longrows) { - testSortCRS(1, 50000, 10000, false); - testSortCRS(1, 50000, 10000, true); + testSortCRS(1, 50000, 10000, false, false); + testSortCRS(1, 50000, 10000, true, false); } TEST_F( TestCategory, common_sort_merge_crsmatrix) {