Skip to content

Commit

Permalink
Merge Pull Request #9958 from ndellingwood/Trilinos/kokkos-snapshot
Browse files Browse the repository at this point in the history
Automatically Merged using Trilinos Pull Request AutoTester
PR Title: Kokkos snapshot
PR Author: ndellingwood
  • Loading branch information
trilinos-autotester authored Nov 23, 2021
2 parents 04cc1ad + 761bc35 commit 7903423
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 80 deletions.
2 changes: 1 addition & 1 deletion packages/kokkos-kernels/master_history.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ tag: 3.3.00 date: 12/16/2020 master: 42defc56 release: e5279e55
tag: 3.3.01 date: 01/18/2021 master: f64b1c57 release: 4e1cc00b
tag: 3.4.00 date: 04/26/2021 master: fe439b21 release: d3c33910
tag: 3.4.01 date: 05/20/2021 master: 564dccb3 release: 4c62eb86
tag: 3.5.00 date: 10/27/2021 master: 00189c0b release: ffe069e7
tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d
113 changes: 80 additions & 33 deletions packages/kokkos/algorithms/src/Kokkos_Sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,10 @@ class BinSort {
//----------------------------------------
// Constructor: takes the keys, the binning_operator and optionally whether to
// sort within bins (default false)
BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
BinSortOp bin_op_, bool sort_within_bins_ = false)
template <typename ExecutionSpace>
BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
int range_begin_, int range_end_, BinSortOp bin_op_,
bool sort_within_bins_ = false)
: keys(keys_),
keys_rnd(keys_),
bin_op(bin_op_),
Expand All @@ -222,50 +224,63 @@ class BinSort {
"Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins());
bin_count_const = bin_count_atomic;
bin_offsets =
offset_type(view_alloc(WithoutInitializing,
offset_type(view_alloc(exec, WithoutInitializing,
"Kokkos::SortImpl::BinSortFunctor::bin_offsets"),
bin_op.max_bins());
sort_order =
offset_type(view_alloc(WithoutInitializing,
offset_type(view_alloc(exec, WithoutInitializing,
"Kokkos::SortImpl::BinSortFunctor::sort_order"),
range_end - range_begin);
}

// Range constructor for callers that do not supply an execution space
// instance. It only forwards to the instance-taking overload, handing it a
// default-constructed execution_space.
BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
        BinSortOp bin_op_, bool sort_within_bins_ = false)
    : BinSort(execution_space(), keys_, range_begin_, range_end_, bin_op_,
              sort_within_bins_) {}

// Whole-view constructor on a caller-provided execution space instance.
// Forwards to the range constructor with the range [0, keys_.extent(0)).
// The range constructor takes `int` bounds, so the extent is converted to
// int here (the same conversion the call performed implicitly).
template <typename ExecutionSpace>
BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
        BinSortOp bin_op_, bool sort_within_bins_ = false)
    : BinSort(exec, keys_, /*range_begin_=*/0,
              /*range_end_=*/static_cast<int>(keys_.extent(0)), bin_op_,
              sort_within_bins_) {}

// Whole-view constructor without an execution space argument. Delegates to
// the execution-space-taking whole-view constructor with a
// default-constructed execution_space, so there is exactly one delegation
// target (the older direct delegation to the range constructor is dropped).
BinSort(const_key_view_type keys_, BinSortOp bin_op_,
        bool sort_within_bins_ = false)
    : BinSort(execution_space{}, keys_, bin_op_, sort_within_bins_) {}

//----------------------------------------
// Create the permutation vector, the bin_offset array and the bin_count
// array. Can be called again if keys changed
void create_permute_vector() {
template <class ExecutionSpace = execution_space>
void create_permute_vector(const ExecutionSpace& exec = execution_space{}) {
const size_t len = range_end - range_begin;
Kokkos::parallel_for(
"Kokkos::Sort::BinCount",
Kokkos::RangePolicy<execution_space, bin_count_tag>(0, len), *this);
Kokkos::RangePolicy<ExecutionSpace, bin_count_tag>(exec, 0, len),
*this);
Kokkos::parallel_scan("Kokkos::Sort::BinOffset",
Kokkos::RangePolicy<execution_space, bin_offset_tag>(
0, bin_op.max_bins()),
Kokkos::RangePolicy<ExecutionSpace, bin_offset_tag>(
exec, 0, bin_op.max_bins()),
*this);

Kokkos::deep_copy(bin_count_atomic, 0);
Kokkos::deep_copy(exec, bin_count_atomic, 0);
Kokkos::parallel_for(
"Kokkos::Sort::BinBinning",
Kokkos::RangePolicy<execution_space, bin_binning_tag>(0, len), *this);
Kokkos::RangePolicy<ExecutionSpace, bin_binning_tag>(exec, 0, len),
*this);

if (sort_within_bins)
Kokkos::parallel_for(
"Kokkos::Sort::BinSort",
Kokkos::RangePolicy<execution_space, bin_sort_bins_tag>(
0, bin_op.max_bins()),
Kokkos::RangePolicy<ExecutionSpace, bin_sort_bins_tag>(
exec, 0, bin_op.max_bins()),
*this);
}

// Sort a subset of a view with respect to the first dimension using the
// permutation array
template <class ValuesViewType>
void sort(ValuesViewType const& values, int values_range_begin,
int values_range_end) const {
template <class ExecutionSpace, class ValuesViewType>
void sort(const ExecutionSpace& exec, ValuesViewType const& values,
int values_range_begin, int values_range_end) const {
using scratch_view_type =
Kokkos::View<typename ValuesViewType::data_type,
typename ValuesViewType::array_layout,
Expand All @@ -279,7 +294,7 @@ class BinSort {
}

scratch_view_type sorted_values(
view_alloc(WithoutInitializing,
view_alloc(exec, WithoutInitializing,
"Kokkos::SortImpl::BinSortFunctor::sorted_values"),
values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
values.rank_dynamic > 1 ? values.extent(1)
Expand Down Expand Up @@ -308,18 +323,31 @@ class BinSort {
values_range_begin - range_begin);

parallel_for("Kokkos::Sort::CopyPermute",
Kokkos::RangePolicy<execution_space>(0, len), functor);
Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
}

{
copy_functor<ValuesViewType, scratch_view_type> functor(
values, range_begin, sorted_values);

parallel_for("Kokkos::Sort::Copy",
Kokkos::RangePolicy<execution_space>(0, len), functor);
Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
}
}

// Sort a subset of a view with respect to the first dimension using the
// permutation array
template <class ValuesViewType>
void sort(ValuesViewType const& values, int values_range_begin,
int values_range_end) const {
execution_space exec;
sort(exec, values, values_range_begin, values_range_end);
exec.fence("Kokkos::Sort: fence after sorting");
}

execution_space().fence("Kokkos::Sort: fence after sorting");
// Sort `values` on the given execution space instance over the range
// [0, range_end - range_begin). The length comes from the sorter's own key
// range, not from values.extent(0).
template <class ExecutionSpace, class ValuesViewType>
void sort(ExecutionSpace const& exec, ValuesViewType const& values) const {
  const auto sorted_length = range_end - range_begin;
  this->sort(exec, values, 0, sorted_length);
}

template <class ValuesViewType>
Expand Down Expand Up @@ -485,8 +513,8 @@ struct BinOp3D {

namespace Impl {

template <class ViewType>
bool try_std_sort(ViewType view) {
template <class ViewType, class ExecutionSpace>
bool try_std_sort(ViewType view, const ExecutionSpace& exec) {
bool possible = true;
size_t stride[8] = {view.stride_0(), view.stride_1(), view.stride_2(),
view.stride_3(), view.stride_4(), view.stride_5(),
Expand All @@ -497,6 +525,7 @@ bool try_std_sort(ViewType view) {
possible = possible && (ViewType::Rank == 1);
possible = possible && (stride[0] == 1);
if (possible) {
exec.fence("Kokkos::sort: Fence before sorting on the host");
std::sort(view.data(), view.data() + view.extent(0));
}
return possible;
Expand All @@ -519,45 +548,63 @@ struct min_max_functor {

} // namespace Impl

template <class ViewType>
void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
template <class ExecutionSpace, class ViewType>
std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
const ExecutionSpace& exec, ViewType const& view,
bool const always_use_kokkos_sort = false) {
if (!always_use_kokkos_sort) {
if (Impl::try_std_sort(view)) return;
if (Impl::try_std_sort(view, exec)) return;
}
using CompType = BinOp1D<ViewType>;

Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
parallel_reduce("Kokkos::Sort::FindExtent",
Kokkos::RangePolicy<typename ViewType::execution_space>(
0, view.extent(0)),
exec, 0, view.extent(0)),
Impl::min_max_functor<ViewType>(view), reducer);
if (result.min_val == result.max_val) return;
BinSort<ViewType, CompType> bin_sort(
view, CompType(view.extent(0) / 2, result.min_val, result.max_val), true);
bin_sort.create_permute_vector();
bin_sort.sort(view);
bin_sort.create_permute_vector(exec);
bin_sort.sort(exec, view);
}

// Blocking whole-view sort. Runs the execution-space overload on a
// default-constructed instance of the view's execution space and fences that
// instance before returning.
template <class ViewType>
void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
  typename ViewType::execution_space exec;
  sort(exec, view, always_use_kokkos_sort);
  exec.fence("Kokkos::Sort: fence after sorting");
}

// Sort the entries [begin, end) of `view` using the execution space instance
// `exec`.
//
// exec:  instance that the reduction and the BinSort kernels are submitted
//        to.
// view:  the view whose sub-range is sorted.
// begin: first index of the range (inclusive).
// end:   end of the range (exclusive).
//
// This overload contains no fence call after the sort is issued; the
// overload without an execution space argument fences.
template <class ExecutionSpace, class ViewType>
std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
    const ExecutionSpace& exec, ViewType view, size_t const begin,
    size_t const end) {
  // Empty or inverted range: nothing to sort. This also keeps the unsigned
  // `end - begin` below from wrapping around.
  if (end <= begin) return;

  // The policy is parameterized on the type of `exec` (as the BinSort
  // members do), so it always matches the instance it is constructed from.
  using range_policy = Kokkos::RangePolicy<ExecutionSpace>;
  using CompType     = BinOp1D<ViewType>;

  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);

  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end),
                  Impl::min_max_functor<ViewType>(view), reducer);

  // All keys in the range are equal: nothing to reorder.
  if (result.min_val == result.max_val) return;

  BinSort<ViewType, CompType> bin_sort(
      exec, view, begin, end,
      CompType((end - begin) / 2, result.min_val, result.max_val), true);

  bin_sort.create_permute_vector(exec);
  bin_sort.sort(exec, view, begin, end);
}

// Blocking ranged sort. Runs the execution-space overload for [begin, end)
// on a default-constructed instance of the view's execution space, then
// fences that instance before returning.
template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) {
  using space_type = typename ViewType::execution_space;
  space_type default_instance;
  sort(default_instance, view, begin, end);
  default_instance.fence("Kokkos::Sort: fence after sorting");
}

} // namespace Kokkos
Expand Down
92 changes: 51 additions & 41 deletions packages/kokkos/algorithms/unit_tests/TestSort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,9 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
KeyViewType keys("Keys", n);

// Test sorting array with all numbers equal
Kokkos::deep_copy(keys, KeyType(1));
Kokkos::sort(keys, force_kokkos);
ExecutionSpace exec;
Kokkos::deep_copy(exec, keys, KeyType(1));
Kokkos::sort(exec, keys, force_kokkos);

Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
Kokkos::fill_random(keys, g,
Expand All @@ -147,13 +148,16 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
double sum_after = 0.0;
unsigned int sort_fails = 0;

Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys), sum_before);
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
sum<ExecutionSpace, KeyType>(keys), sum_before);

Kokkos::sort(keys, force_kokkos);
Kokkos::sort(exec, keys, force_kokkos);

Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys), sum_after);
Kokkos::parallel_reduce(
n - 1, is_sorted_struct<ExecutionSpace, KeyType>(keys), sort_fails);
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
sum<ExecutionSpace, KeyType>(keys), sum_after);
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n - 1),
is_sorted_struct<ExecutionSpace, KeyType>(keys),
sort_fails);

double ratio = sum_before / sum_after;
double epsilon = 1e-10;
Expand All @@ -177,8 +181,10 @@ void test_3D_sort_impl(unsigned int n) {
double sum_after = 0.0;
unsigned int sort_fails = 0;

Kokkos::parallel_reduce(keys.extent(0), sum3D<ExecutionSpace, KeyType>(keys),
sum_before);
ExecutionSpace exec;
Kokkos::parallel_reduce(
Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
sum3D<ExecutionSpace, KeyType>(keys), sum_before);

int bin_1d = 1;
while (bin_1d * bin_1d * bin_1d * 4 < (int)keys.extent(0)) bin_1d *= 2;
Expand All @@ -189,15 +195,17 @@ void test_3D_sort_impl(unsigned int n) {
using BinOp = Kokkos::BinOp3D<KeyViewType>;
BinOp bin_op(bin_max, min, max);
Kokkos::BinSort<KeyViewType, BinOp> Sorter(keys, bin_op, false);
Sorter.create_permute_vector();
Sorter.template sort<KeyViewType>(keys);

Kokkos::parallel_reduce(keys.extent(0), sum3D<ExecutionSpace, KeyType>(keys),
sum_after);
Kokkos::parallel_reduce(keys.extent(0) - 1,
bin3d_is_sorted_struct<ExecutionSpace, KeyType>(
keys, bin_1d, min[0], max[0]),
sort_fails);
Sorter.create_permute_vector(exec);
Sorter.sort(exec, keys);

Kokkos::parallel_reduce(
Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
sum3D<ExecutionSpace, KeyType>(keys), sum_after);
Kokkos::parallel_reduce(
Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0) - 1),
bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys, bin_1d, min[0],
max[0]),
sort_fails);

double ratio = sum_before / sum_after;
double epsilon = 1e-10;
Expand Down Expand Up @@ -229,36 +237,36 @@ void test_dynamic_view_sort_impl(unsigned int n) {
KeyViewType keys_view("KeysTmp", n);

// Test sorting array with all numbers equal
Kokkos::deep_copy(keys_view, KeyType(1));
ExecutionSpace exec;
Kokkos::deep_copy(exec, keys_view, KeyType(1));
Kokkos::deep_copy(keys, keys_view);
Kokkos::sort(keys, 0 /* begin */, n /* end */);
Kokkos::sort(exec, keys, 0 /* begin */, n /* end */);

Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
Kokkos::fill_random(keys_view, g,
Kokkos::Random_XorShift64_Pool<
ExecutionSpace>::generator_type::MAX_URAND);

ExecutionSpace().fence();
exec.fence();
Kokkos::deep_copy(keys, keys_view);
// ExecutionSpace().fence();

double sum_before = 0.0;
double sum_after = 0.0;
unsigned int sort_fails = 0;

Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys_view),
sum_before);
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
sum<ExecutionSpace, KeyType>(keys_view), sum_before);

Kokkos::sort(keys, 0 /* begin */, n /* end */);
Kokkos::sort(exec, keys, 0 /* begin */, n /* end */);

ExecutionSpace().fence(); // Need this fence to prevent BusError with Cuda
exec.fence(); // Need this fence to prevent BusError with Cuda
Kokkos::deep_copy(keys_view, keys);
// ExecutionSpace().fence();

Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys_view),
sum_after);
Kokkos::parallel_reduce(
n - 1, is_sorted_struct<ExecutionSpace, KeyType>(keys_view), sort_fails);
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
sum<ExecutionSpace, KeyType>(keys_view), sum_after);
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n - 1),
is_sorted_struct<ExecutionSpace, KeyType>(keys_view),
sort_fails);

double ratio = sum_before / sum_after;
double epsilon = 1e-10;
Expand Down Expand Up @@ -301,9 +309,10 @@ void test_issue_1160_impl() {
for (int i = 0; i < 10; ++i) {
h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i));
}
Kokkos::deep_copy(element_, h_element);
Kokkos::deep_copy(x_, h_x);
Kokkos::deep_copy(v_, h_v);
ExecutionSpace exec;
Kokkos::deep_copy(exec, element_, h_element);
Kokkos::deep_copy(exec, x_, h_x);
Kokkos::deep_copy(exec, v_, h_v);

using KeyViewType = decltype(element_);
using BinOp = Kokkos::BinOp1D<KeyViewType>;
Expand All @@ -316,15 +325,16 @@ void test_issue_1160_impl() {

Kokkos::BinSort<KeyViewType, BinOp> Sorter(element_, begin, end, binner,
false);
Sorter.create_permute_vector();
Sorter.sort(element_, begin, end);
Sorter.create_permute_vector(exec);
Sorter.sort(exec, element_, begin, end);

Sorter.sort(x_, begin, end);
Sorter.sort(v_, begin, end);
Sorter.sort(exec, x_, begin, end);
Sorter.sort(exec, v_, begin, end);

Kokkos::deep_copy(h_element, element_);
Kokkos::deep_copy(h_x, x_);
Kokkos::deep_copy(h_v, v_);
Kokkos::deep_copy(exec, h_element, element_);
Kokkos::deep_copy(exec, h_x, x_);
Kokkos::deep_copy(exec, h_v, v_);
exec.fence();

ASSERT_EQ(h_element(0), 9);
ASSERT_EQ(h_element(1), 8);
Expand Down
Loading

0 comments on commit 7903423

Please sign in to comment.