Skip to content

Commit

Permalink
Final cleanup, formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
brian-kelley committed Aug 1, 2024
1 parent a0a0b41 commit 92267d9
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 80 deletions.
10 changes: 6 additions & 4 deletions perf_test/sparse/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,12 @@ KOKKOSKERNELS_ADD_EXECUTABLE(
SOURCES KokkosSparse_mdf.cpp
)

KOKKOSKERNELS_ADD_EXECUTABLE(
sparse_sort_crs
SOURCES KokkosSparse_sort_crs.cpp
)
# Do not build this CRS sorting perf test by default.
# It can be enabled if needed by uncommenting these lines.
#KOKKOSKERNELS_ADD_EXECUTABLE(
# sparse_sort_crs
# SOURCES KokkosSparse_sort_crs.cpp
#)

if (KokkosKernels_ENABLE_BENCHMARK)
KOKKOSKERNELS_ADD_BENCHMARK(
Expand Down
9 changes: 9 additions & 0 deletions sparse/impl/KokkosSparse_sort_crs_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,15 @@ void applyPermutationBlockValues(const ExecSpace& exec,
out(i) = in(permutation(blockIndex) * scalarsPerBlock + offsetInBlock);
});
}

// Decides whether CRS sorting should be done as a single bulk sort.
//
//   avgDeg: average number of entries per row
//   maxDeg: number of entries in the longest row
//
// Returns true if either of the following holds:
//   - the longest row has more than 1024 entries, or
//   - the row lengths are highly imbalanced, i.e. maxDeg divided by 10
//     (using Ordinal's division) is still greater than avgDeg.
template <typename Ordinal>
bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) {
  // Rows longer than this are considered "long" regardless of the average.
  constexpr int longRowThreshold = 1024;
  // How many times larger than the average the longest row must be
  // for the matrix to count as imbalanced.
  constexpr int imbalanceFactor = 10;

  if (maxDeg > longRowThreshold) {
    return true;
  }
  return maxDeg / imbalanceFactor > avgDeg;
}

} // namespace Impl
} // namespace KokkosSparse

Expand Down
109 changes: 63 additions & 46 deletions sparse/src/KokkosSparse_SortCrs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define _KOKKOSSPARSE_SORTCRS_HPP

#include "KokkosSparse_sort_crs_impl.hpp"
#include "KokkosSparse_Utils.hpp"
#include "KokkosKernels_Sorting.hpp"

namespace KokkosSparse {
Expand Down Expand Up @@ -64,15 +65,17 @@ void sort_crs_matrix(
"sort_crs_matrix: entries_t must not be const-valued");
static_assert(!std::is_const_v<typename values_t::value_type>,
"sort_crs_matrix: value_t must not be const-valued");
using Ordinal = typename entries_t::non_const_value_type;
using Scalar = typename values_t::non_const_value_type;
using Offset = typename rowmap_t::non_const_value_type;
using Ordinal = typename entries_t::non_const_value_type;
using Scalar = typename values_t::non_const_value_type;
// This early return condition covers having 0 or 1 entries,
// which is also implied by having 0 rows or 0 columns.
// If only 1 entry, the matrix is already sorted.
if (entries.extent(0) <= size_t(1)) {
return;
}
Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0;
if (numRows == 0) return;
if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space<execution_space>()) {
using Offset = typename rowmap_t::non_const_value_type;
using UnsignedOrdinal = typename std::make_unsigned<Ordinal>::type;
using entries_managed_t = Kokkos::View<typename entries_t::data_type,
typename entries_t::device_type>;
Expand Down Expand Up @@ -100,21 +103,46 @@ void sort_crs_matrix(
values.data() + rowStart, valuesAux.data() + rowStart, rowNum);
});
} else {
// On GPUs, prefer to do a single bulk sort.
if (numCols == Kokkos::ArithTraits<Ordinal>::max()) {
KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries,
numCols);
numCols++;
// On GPUs:
// If the matrix is highly imbalanced, or has long rows AND the dimensions
// are not too large to do one large bulk sort, do that. Otherwise, sort
// using one Kokkos thread per row.
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap);
bool useBulkSort = false;
if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) {
// Calculate the true number of columns if user didn't pass it in
if (numCols == Kokkos::ArithTraits<Ordinal>::max()) {
KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0),
entries, numCols);
numCols++;
}
uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols;
useBulkSort = maxBulkKey / numRows == (uint64_t)numCols;
}
uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols;
if (maxBulkKey / numRows != (uint64_t)numCols) {
using Offset = typename rowmap_t::non_const_value_type;
if (useBulkSort) {
auto permutation = KokkosSparse::Impl::computeEntryPermutation(
exec, rowmap, entries, numCols);
// Permutations cannot be done in-place
Kokkos::View<typename values_t::value_type*, execution_space> origValues(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"),
values.extent(0));
Kokkos::View<typename entries_t::value_type*, execution_space>
origEntries(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "origEntries"),
entries.extent(0));
Kokkos::deep_copy(exec, origValues, values);
Kokkos::deep_copy(exec, origEntries, entries);
KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries,
entries);
KokkosSparse::Impl::applyPermutation(exec, permutation, origValues,
values);
} else {
using TeamPol = Kokkos::TeamPolicy<execution_space>;
using TeamMem = typename TeamPol::member_type;
// Can't use bulk sort approach as matrix dimensions are too large.
// Fall back to parallel thread-level sort within each row.
Ordinal vectorLength = 1;
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
while (vectorLength < avgDeg / 2) {
vectorLength *= 2;
}
Expand All @@ -130,23 +158,6 @@ void sort_crs_matrix(
TeamPol(exec, (numRows + teamSize - 1) / teamSize,
teamSize, vectorLength),
funct);
} else {
auto permutation = KokkosSparse::Impl::computeEntryPermutation(
exec, rowmap, entries, numCols);
// Permutations cannot be done in-place
Kokkos::View<typename values_t::value_type*, execution_space> origValues(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "origValues"),
values.extent(0));
Kokkos::View<typename entries_t::value_type*, execution_space>
origEntries(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "origEntries"),
entries.extent(0));
Kokkos::deep_copy(exec, origValues, values);
Kokkos::deep_copy(exec, origEntries, entries);
KokkosSparse::Impl::applyPermutation(exec, permutation, origEntries,
entries);
KokkosSparse::Impl::applyPermutation(exec, permutation, origValues,
values);
}
}
}
Expand Down Expand Up @@ -263,6 +274,7 @@ void sort_crs_graph(
typename entries_t::non_const_value_type numCols =
Kokkos::ArithTraits<typename entries_t::non_const_value_type>::max()) {
using Ordinal = typename entries_t::non_const_value_type;
using Offset = typename rowmap_t::non_const_value_type;
static_assert(
Kokkos::SpaceAccessibility<execution_space,
typename rowmap_t::memory_space>::accessible,
Expand All @@ -286,7 +298,6 @@ void sort_crs_graph(
using entries_managed_t = Kokkos::View<typename entries_t::data_type,
typename entries_t::device_type>;
using UnsignedOrdinal = typename std::make_unsigned<Ordinal>::type;
using Offset = typename rowmap_t::non_const_value_type;
entries_managed_t entriesAux(
Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"),
entries.extent(0));
Expand All @@ -304,22 +315,32 @@ void sort_crs_graph(
(UnsignedOrdinal*)entriesAux.data() + rowStart, rowNum);
});
} else {
// On GPU, prefer to sort all entries in one bulk sort.
// This is only possible if (row, col) pairs can be numbered using uint64_t
if (numCols == Kokkos::ArithTraits<Ordinal>::max()) {
KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries,
numCols);
numCols++;
// On GPUs:
// If the graph is highly imbalanced AND the dimensions are not too large
// to do one large bulk sort, do that. Otherwise, sort using one Kokkos
// thread per row.
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap);
bool useBulkSort = false;
if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) {
// Calculate the true number of columns if user didn't pass it in
if (numCols == Kokkos::ArithTraits<Ordinal>::max()) {
KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0),
entries, numCols);
numCols++;
}
uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols;
useBulkSort = maxBulkKey / numRows == (uint64_t)numCols;
}
uint64_t maxBulkKey = (uint64_t)numRows * (uint64_t)numCols;
// Check if the multiplication above overflowed
if (maxBulkKey / numRows != (uint64_t)numCols) {
using Offset = typename rowmap_t::non_const_value_type;
if (useBulkSort) {
auto keys = KokkosSparse::Impl::generateBulkCrsKeys(exec, rowmap, entries,
numCols);
Kokkos::Experimental::sort_by_key(exec, keys, entries);
} else {
using TeamPol = Kokkos::TeamPolicy<execution_space>;
using TeamMem = typename TeamPol::member_type;
// Fall back to thread-level sort within each row
Ordinal vectorLength = 1;
Ordinal avgDeg = (entries.extent(0) + numRows - 1) / numRows;
while (vectorLength < avgDeg / 2) {
vectorLength *= 2;
}
Expand All @@ -335,10 +356,6 @@ void sort_crs_graph(
TeamPol(exec, (numRows + teamSize - 1) / teamSize,
teamSize, vectorLength),
funct);
} else {
auto keys = KokkosSparse::Impl::generateBulkCrsKeys(exec, rowmap, entries,
numCols);
Kokkos::Experimental::sort_by_key(exec, keys, entries);
}
}
}
Expand Down
23 changes: 19 additions & 4 deletions sparse/src/KokkosSparse_Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -981,6 +981,21 @@ ordinal_t graph_max_degree(const rowmap_t &rowmap) {
return val;
}

template <typename execution_space, typename rowmap_t>
typename rowmap_t::non_const_value_type graph_max_degree(
const execution_space &exec, const rowmap_t &rowmap) {
using Offset = typename rowmap_t::non_const_value_type;
using Reducer = Kokkos::Max<Offset>;
Offset nrows = rowmap.extent(0);
if (nrows) nrows--;
if (nrows == 0) return 0;
Offset val;
Kokkos::parallel_reduce(Kokkos::RangePolicy<execution_space>(exec, 0, nrows),
MaxDegreeFunctor<Reducer, Offset, rowmap_t>(rowmap),
Reducer(val));
return val;
}

template <typename device_t, typename ordinal_t, typename rowmap_t>
void graph_min_max_degree(const rowmap_t &rowmap, ordinal_t &min_degree,
ordinal_t &max_degree) {
Expand Down Expand Up @@ -1625,8 +1640,8 @@ void kk_create_incidence_matrix_from_original_matrix(
lno_t col_perm = col;
if (perm) col_perm = perm[col];
if (row_perm > col_perm) {
typedef typename std::remove_reference<decltype(
out_rowmap_copy[0])>::type atomic_incr_type;
typedef typename std::remove_reference<
decltype(out_rowmap_copy[0])>::type atomic_incr_type;
size_type row_write_index = Kokkos::atomic_fetch_add(
&(out_rowmap_copy[row]), atomic_incr_type(1));
size_type col_write_index = Kokkos::atomic_fetch_add(
Expand Down Expand Up @@ -1658,8 +1673,8 @@ void kk_create_incidence_matrix_from_original_matrix(
lno_t col_perm = col;
if (perm) col_perm = perm[col];
if (row_perm < col_perm) {
typedef typename std::remove_reference<decltype(
out_rowmap_copy[0])>::type atomic_incr_type;
typedef typename std::remove_reference<
decltype(out_rowmap_copy[0])>::type atomic_incr_type;
size_type row_write_index = Kokkos::atomic_fetch_add(
&(out_rowmap_copy[row]), atomic_incr_type(1));
size_type col_write_index = Kokkos::atomic_fetch_add(
Expand Down
52 changes: 26 additions & 26 deletions sparse/unit_test/Test_Sparse.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,33 @@
#ifndef TEST_SPARSE_HPP
#define TEST_SPARSE_HPP

//#include "Test_Sparse_coo2crs.hpp"
//#include "Test_Sparse_crs2coo.hpp"
//#include "Test_Sparse_Controls.hpp"
//#include "Test_Sparse_CrsMatrix.hpp"
//#include "Test_Sparse_mdf.hpp"
//#include "Test_Sparse_findRelOffset.hpp"
//#include "Test_Sparse_gauss_seidel.hpp"
//#include "Test_Sparse_MergeMatrix.hpp"
//#include "Test_Sparse_replaceSumInto.hpp"
//#include "Test_Sparse_replaceSumIntoLonger.hpp"
//#include "Test_Sparse_spadd.hpp"
//#include "Test_Sparse_spgemm_jacobi.hpp"
//#include "Test_Sparse_spgemm.hpp"
#include "Test_Sparse_coo2crs.hpp"
#include "Test_Sparse_crs2coo.hpp"
#include "Test_Sparse_Controls.hpp"
#include "Test_Sparse_CrsMatrix.hpp"
#include "Test_Sparse_mdf.hpp"
#include "Test_Sparse_findRelOffset.hpp"
#include "Test_Sparse_gauss_seidel.hpp"
#include "Test_Sparse_MergeMatrix.hpp"
#include "Test_Sparse_replaceSumInto.hpp"
#include "Test_Sparse_replaceSumIntoLonger.hpp"
#include "Test_Sparse_spadd.hpp"
#include "Test_Sparse_spgemm_jacobi.hpp"
#include "Test_Sparse_spgemm.hpp"
#include "Test_Sparse_SortCrs.hpp"
//#include "Test_Sparse_spiluk.hpp"
//#include "Test_Sparse_spmv.hpp"
//#include "Test_Sparse_sptrsv.hpp"
//#include "Test_Sparse_trsv.hpp"
//#include "Test_Sparse_par_ilut.hpp"
//#include "Test_Sparse_gmres.hpp"
//#include "Test_Sparse_Transpose.hpp"
//#include "Test_Sparse_TestUtils_RandCsMat.hpp"
//#include "Test_Sparse_IOUtils.hpp"
//#include "Test_Sparse_ccs2crs.hpp"
//#include "Test_Sparse_crs2ccs.hpp"
//#include "Test_Sparse_removeCrsMatrixZeros.hpp"
//#include "Test_Sparse_extractCrsDiagonalBlocks.hpp"
#include "Test_Sparse_spiluk.hpp"
#include "Test_Sparse_spmv.hpp"
#include "Test_Sparse_sptrsv.hpp"
#include "Test_Sparse_trsv.hpp"
#include "Test_Sparse_par_ilut.hpp"
#include "Test_Sparse_gmres.hpp"
#include "Test_Sparse_Transpose.hpp"
#include "Test_Sparse_TestUtils_RandCsMat.hpp"
#include "Test_Sparse_IOUtils.hpp"
#include "Test_Sparse_ccs2crs.hpp"
#include "Test_Sparse_crs2ccs.hpp"
#include "Test_Sparse_removeCrsMatrixZeros.hpp"
#include "Test_Sparse_extractCrsDiagonalBlocks.hpp"

// TPL specific tests, these require
// particular pairs of backend and TPL
Expand Down

0 comments on commit 92267d9

Please sign in to comment.