Skip to content

Commit

Permalink
Merge pull request #899 from iyamazaki/sptrsv-symbolic
Browse files Browse the repository at this point in the history
Supernodal SpTRSV, improve symbolic performance
  • Loading branch information
e10harvey authored Mar 18, 2021
2 parents 1fdcdbc + 0016058 commit d7fc6d8
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 35 deletions.
13 changes: 12 additions & 1 deletion src/sparse/KokkosSparse_sptrsv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,17 @@ namespace Experimental {
Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess> > Entries_Internal;


#ifdef KK_TRISOLVE_TIMERS
Kokkos::Timer timer_sptrsv;
#endif
RowMap_Internal rowmap_i = rowmap;
Entries_Internal entries_i = entries;

KokkosSparse::Impl::SPTRSV_SYMBOLIC<const_handle_type, RowMap_Internal, Entries_Internal>::sptrsv_symbolic (&tmp_handle, rowmap_i, entries_i);

#ifdef KK_TRISOLVE_TIMERS
std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() << std::endl;
#endif
} // sptrsv_symbolic

template <typename KernelHandle,
Expand Down Expand Up @@ -167,6 +173,9 @@ namespace Experimental {
typename scalar_nnz_view_t_::device_type,
Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess> > Values_Internal;

#ifdef KK_TRISOLVE_TIMERS
Kokkos::Timer timer_sptrsv;
#endif
auto sptrsv_handle = handle->get_sptrsv_handle();
if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) {
RowMap_Internal rowmap_i = rowmap;
Expand All @@ -189,7 +198,9 @@ namespace Experimental {
else {
KokkosSparse::Experimental::sptrsv_symbolic (handle, rowmap, entries);
}

#ifdef KK_TRISOLVE_TIMERS
std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() << std::endl;
#endif
} // sptrsv_symbolic

template <typename KernelHandle,
Expand Down
5 changes: 5 additions & 0 deletions src/sparse/KokkosSparse_sptrsv_superlu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ void sptrsv_symbolic(
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
double time_seconds = tic.seconds ();
std::cout << " Conversion Time (from SuperLU to CSR): " << time_seconds << std::endl;
tic.reset();
#endif

// ===================================================================
Expand All @@ -313,6 +314,10 @@ void sptrsv_symbolic(
sptrsv_supernodal_symbolic (nsuper, supercols, etree,
graphL_host, kernelHandleL,
graphU_host, kernelHandleU);
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
time_seconds = tic.seconds ();
std::cout << " SpTRSV Supernodal Symbolic Time : " << time_seconds << std::endl;
#endif
}


Expand Down
59 changes: 57 additions & 2 deletions src/sparse/KokkosSparse_sptrsv_supernode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,10 @@ void check_supernode_sizes(const char *title, int n, int nsuper, input_size_type
template <typename host_graph_t, typename graph_t, typename input_size_type>
host_graph_t
generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const input_size_type *nb) {
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
double time_seconds = 0.0;
Kokkos::Timer timer;
#endif

using size_type = typename graph_t::size_type;
using cols_view_host_t = typename host_graph_t::entries_type::non_const_type;
Expand All @@ -476,13 +480,19 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu
// count non-empty supernodal blocks
row_map_view_host_t hr ("rowmap_view", nsuper+1);
integer_view_host_t check ("check", nsuper);
integer_view_host_t idxs ("idxs", nsuper);
Kokkos::deep_copy (hr, 0);
Kokkos::deep_copy (check, 0);

#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
timer.reset ();
#endif
int nblocks = 0;
for (int s = 0; s < nsuper; s++) {
int j1 = nb[s];
int j2 = j1+1; // based on the first row

size_type nidxs = 0;
for (size_type i = row_map_host (j1); i < row_map_host (j2); i++) {
int s2 = map (entries_host (i));
// supernodal blocks may not be filled with zeros
Expand All @@ -493,10 +503,16 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu
nblocks ++;
// count blocks per row for col_major
hr (s2+1) ++;
// keep track of non-zero block ids
idxs (nidxs) = s2;
nidxs ++;
}
}
// reset check
Kokkos::deep_copy (check, 0);
//Kokkos::deep_copy (check, 0);
for (size_type i = 0; i < nidxs; i++) {
check (idxs(i)) = 0;
}
}

cols_view_host_t hc ("colmap_view", nblocks);
Expand All @@ -506,11 +522,18 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu
hr (s+1) += hr (s);
}
}
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
time_seconds = timer.seconds ();
std::cout << " > Generate Supernodal Graph: count blocks : " << time_seconds << std::endl;
timer.reset ();
#endif

nblocks = 0;
for (int s = 0; s < nsuper; s++) {
int j1 = nb[s];
int j2 = j1+1; // based on the first row

size_type nidxs = 0;
for (size_type i = row_map_host (j1); i < row_map_host (j2); i++) {
int s2 = map (entries_host (i));
// supernodal blocks may not be filled with zeros
Expand All @@ -525,19 +548,25 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu
hc (nblocks) = s2;
}
nblocks ++;
// keep track of non-zero block ids
idxs (nidxs) = s2;
nidxs ++;
}
}
if (!col_major) {
hr (s+1) = nblocks;
}
// reset check
if (!col_major) {
/*if (!col_major) {
for (size_type s2 = hr(s); s2 < hr(s+1); s2++) {
check (hc(s2)) = 0;
}
} else {
// NOTE: nonzero supernodes in s-th col are not stored
Kokkos::deep_copy (check, 0);
}*/
for (size_type i = 0; i < nidxs; i++) {
check (idxs(i)) = 0;
}
}
// fix hr
Expand All @@ -547,10 +576,21 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu
}
hr (0) = 0;
}
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
time_seconds = timer.seconds ();
std::cout << " > Generate Supernodal Graph: compress graph : " << time_seconds
<< " (col_major = " << col_major << ")" << std::endl;
timer.reset ();
#endif

// sort column ids per row
for (int s = 0; s < nsuper; s++) {
std::sort(&(hc (hr (s))), &(hc (hr (s+1))));
}
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
time_seconds = timer.seconds ();
std::cout << " > Generate Supernodal Graph: sort graph : " << time_seconds << std::endl << std::endl;
#endif

host_graph_t static_graph (hc, hr);
return static_graph;
Expand Down Expand Up @@ -1018,17 +1058,32 @@ void sptrsv_supernodal_symbolic(
// save the supernodal info in the handles for L/U solves
handleL->set_supernodes (nsuper, supercols_view, etree);
handleU->set_supernodes (nsuper, supercols_view, etree);
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
time_seconds = tic.seconds ();
std::cout << " Deep-copy graph Time: " << time_seconds << std::endl;
tic.reset ();
#endif

if (handleL->get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_DAG ||
handleL->get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) {
// generate supernodal graphs for DAG scheduling
auto supL = generate_supernodal_graph<host_graph_t> (!col_majorL, graphL_host, nsuper, supercols);
auto supU = generate_supernodal_graph<host_graph_t> ( col_majorU, graphU_host, nsuper, supercols);
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
time_seconds = tic.seconds ();
std::cout << " Compute Supernodal Graph Time: " << time_seconds << std::endl;
tic.reset ();
#endif

auto dagL = generate_supernodal_dag<host_graph_t> (nsuper, supL, supU);
auto dagU = generate_supernodal_dag<host_graph_t> (nsuper, supU, supL);
handleL->set_supernodal_dag (dagL);
handleU->set_supernodal_dag (dagU);
#ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE
time_seconds = tic.seconds ();
std::cout << " Compute DAG Time: " << time_seconds << std::endl;
tic.reset ();
#endif
}

// ===================================================================
Expand Down
23 changes: 22 additions & 1 deletion src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2656,6 +2656,10 @@ cudaProfilerStop();

size_type node_count = 0;

#ifdef profile_supernodal_etree
Kokkos::Timer sptrsv_timer;
sptrsv_timer.reset();
#endif
for ( size_type lvl = 0; lvl < nlevels; ++lvl ) {
{
size_type lvl_nodes = hnodes_per_level(lvl);
Expand Down Expand Up @@ -2716,7 +2720,6 @@ cudaProfilerStart();
thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_ETREE ||
thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_DAG) {

//#define profile_supernodal_etree
#ifdef profile_supernodal_etree
size_t flops = 0;
Kokkos::Timer timer;
Expand Down Expand Up @@ -2884,6 +2887,13 @@ cudaProfilerStop();
} // scope for if-block

} // end for lvl
#ifdef profile_supernodal_etree
Kokkos::fence();
double sptrsv_time_seconds = sptrsv_timer.seconds ();
std::cout << " + Execution space : " << execution_space::name () << std::endl;
std::cout << " + Memory space : " << memory_space::name () << std::endl;
std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl << std::endl;
#endif

} // end lower_tri_solve

Expand Down Expand Up @@ -2954,6 +2964,10 @@ cudaProfilerStop();
size_type node_count = 0;

// This loop over levels must stay serial; it would be worth trying CUDA Graphs to reduce kernel launch overhead
#ifdef profile_supernodal_etree
Kokkos::Timer sptrsv_timer;
sptrsv_timer.reset();
#endif
for ( size_type lvl = 0; lvl < nlevels; ++lvl ) {
size_type lvl_nodes = hnodes_per_level(lvl);

Expand Down Expand Up @@ -3279,6 +3293,13 @@ cudaProfilerStop();
#endif
} // end if
} // end for lvl
#ifdef profile_supernodal_etree
Kokkos::fence();
double sptrsv_time_seconds = sptrsv_timer.seconds ();
std::cout << " + SpTrsv(upper) time: " << sptrsv_time_seconds << std::endl << std::endl;
std::cout <<" + Execution space : " << execution_space::name () << std::endl;
std::cout << " + Memory space : " << memory_space::name () << std::endl;
#endif

} // end upper_tri_solve

Expand Down
Loading

0 comments on commit d7fc6d8

Please sign in to comment.