From 768c7e6e41a9fbb8f0d4163e59474410ea839ed0 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 21 Jan 2020 16:12:34 -0700 Subject: [PATCH] Fixed clustering, RCM, BFS accessing neighbors >= nv Tpetra graphs contain remote columns (entries) that can't be used as row indices (so the "local" graph may be symmetric, but entries >= #numRows must be filtered). Added this filtering to RCM, BFS and balloon clustering. Also added a test to Gauss-Seidel for zero-row case, where the input row map can have length 0 or 1. Ran clean through valgrind in openmp. --- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 19 ++---- .../impl/KokkosSparse_partitioning_impl.hpp | 62 +++++++------------ unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 44 +++++++++++++ 3 files changed, 73 insertions(+), 52 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 6aba3caad9..e8e0d6e4db 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -529,7 +529,7 @@ namespace KokkosSparse{ struct BuildCrossClusterMaskFunctor { BuildCrossClusterMaskFunctor(Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& mask_) - : rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) + : numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_) {} //Used a fixed-size hash set in shared memory @@ -600,6 +600,9 @@ namespace KokkosSparse{ [&] (const nnz_lno_t j) { nnz_lno_t nei = colinds(rowmap(row) + j); + //Remote neighbors are not included + if(nei >= numRows) + return; nnz_lno_t neiCluster = vertClusters(nei); if(neiCluster != cluster) { @@ -622,6 +625,7 @@ namespace KokkosSparse{ return tableSize() * sizeof(int); } + nnz_lno_t numRows; Rowmap rowmap; Colinds colinds; nnz_view_t clusterOffsets; @@ -776,21 +780,10 @@ namespace KokkosSparse{ raw_sym_xadj = raw_rowmap_t(sym_xadj.data(), sym_xadj.extent(0)); raw_sym_adj = raw_colinds_t(sym_adj.data(), sym_adj.extent(0)); } - bool onCuda = false; -#ifdef KOKKOS_ENABLE_CUDA - onCuda = std::is_same::value; -#endif nnz_view_t vertClusters; auto clusterAlgo = gsHandle->get_clustering_algo(); if(clusterAlgo == CLUSTER_DEFAULT) - { - //Use CM if > 50 entries per row, otherwise balloon clustering. - //CM is quite fast on CPUs if the level sets fan out quickly, otherwise slow and non-scalable. - if(!onCuda && (raw_sym_adj.extent(0) / num_rows > 50)) - clusterAlgo = CLUSTER_CUTHILL_MCKEE; - else - clusterAlgo = CLUSTER_BALLOON; - } + clusterAlgo = CLUSTER_BALLOON; switch(clusterAlgo) { case CLUSTER_CUTHILL_MCKEE: diff --git a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp index 64e5d804d4..82ed7eb205 100644 --- a/src/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/src/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -256,7 +256,7 @@ struct RCM { typedef Kokkos::View> WorkView; - BfsFunctor(WorkView& workQueue_, WorkView& scratch_, nnz_view_t& visit_, const_lno_row_view_t& rowmap_, const_lno_nnz_view_t& colinds_, single_view_t& numLevels_, nnz_view_t threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_) + BfsFunctor(const WorkView& workQueue_, const WorkView& scratch_, const nnz_view_t& visit_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const single_view_t& numLevels_, const nnz_view_t& threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_) : workQueue(workQueue_), scratch(scratch_), visit(visit_), rowmap(rowmap_), colinds(colinds_), numLevels(numLevels_), threadNeighborCounts(threadNeighborCounts_), start(start_), numRows(numRows_) {} @@ -310,7 +310,7 @@ struct RCM { nnz_lno_t col = colinds(j); //use atomic here to guarantee neighbors are added to neighborList exactly once - if(Kokkos::atomic_compare_exchange_strong(&visit(col), NOT_VISITED, QUEUED)) + if(col < numRows && Kokkos::atomic_compare_exchange_strong(&visit(col), NOT_VISITED, QUEUED)) { //this thread is the first to see that col needs to be queued neighborList(neiCount) = col; @@ -394,9 +394,9 @@ struct RCM nnz_lno_t numRows; }; - //parallel breadth-first search, producing level structure in (xadj, adj) form: - //xadj(level) gives index in adj where level begins - //also return the total number of levels + //Parallel breadth-first search, producing level structure in (xadj, adj) form: + //xadj(level) gives index in adj where level begins. + //Returns the total number of levels, and sets xadj, adj and maxDeg. nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads) { //need to know maximum degree to allocate scratch space for threads @@ -426,9 +426,11 @@ struct RCM { typedef Kokkos::View> ScoreView; - CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const_lno_row_view_t& rowmap_, const_lno_nnz_view_t& colinds_, ScoreView& scores_, ScoreView& scoresAux_, nnz_view_t& visit_, nnz_view_t& xadj_, nnz_view_t& adj_, nnz_view_t& adjAux_) + CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const ScoreView& scores_, const ScoreView& scoresAux_, const nnz_view_t& visit_, const nnz_view_t& xadj_, const nnz_view_t& adj_, const nnz_view_t& adjAux_) : numLevels(numLevels_), maxDegree(maxDegree_), rowmap(rowmap_), colinds(colinds_), scores(scores_), scoresAux(scoresAux_), visit(visit_), xadj(xadj_), adj(adj_), adjAux(adjAux_) - {} + { + numRows = rowmap.extent(0) - 1; + } KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const { @@ -455,9 +457,12 @@ struct RCM for(offset_t j = rowStart; j < rowEnd; j++) { nnz_lno_t neighbor = colinds(j); - nnz_lno_t neighborVisit = visit(neighbor); - if(neighborVisit < minNeighbor) - minNeighbor = neighborVisit; + if(neighbor < numRows) + { + nnz_lno_t neighborVisit = visit(neighbor); + if(neighborVisit < minNeighbor) + minNeighbor = neighborVisit; + } } scores(i) = ((offset_t) minNeighbor * (maxDegree + 1)) + (rowmap(process + 1) - rowmap(process)); } @@ -480,6 +485,7 @@ struct RCM } } + nnz_lno_t numRows; nnz_lno_t numLevels; nnz_lno_t maxDegree; const_lno_row_view_t rowmap; @@ -497,7 +503,7 @@ struct RCM //Does the reversing in "reverse Cuthill-McKee") struct OrderReverseFunctor { - OrderReverseFunctor(nnz_view_t& visit_, nnz_lno_t numRows_) + OrderReverseFunctor(const nnz_view_t& visit_, nnz_lno_t numRows_) : visit(visit_), numRows(numRows_) {} @@ -549,7 +555,7 @@ struct RCM struct MinDegreeRowFunctor { typedef typename Reducer::value_type Value; - MinDegreeRowFunctor(const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {} + MinDegreeRowFunctor(const const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_type i, Value& lval) const { size_type ideg = rowmap(i + 1) - rowmap(i); @@ -565,7 +571,7 @@ struct RCM //parallel-for functor that assigns a cluster given a envelope-reduced reordering (like RCM) struct OrderToClusterFunctor { - OrderToClusterFunctor(const nnz_view_t& ordering_, nnz_view_t vertClusters_, nnz_lno_t clusterSize_) + OrderToClusterFunctor(const nnz_view_t& ordering_, const nnz_view_t& vertClusters_, nnz_lno_t clusterSize_) : ordering(ordering_), vertClusters(vertClusters_), clusterSize(clusterSize_) {} @@ -590,28 +596,6 @@ struct RCM return v.loc; } - size_type countBlockDiagEntries(nnz_view_t labels, nnz_lno_t blockSize) - { - auto labelsHost = Kokkos::create_mirror_view(labels); - Kokkos::deep_copy(labelsHost, labels); - auto rowmapHost = Kokkos::create_mirror_view(rowmap); - Kokkos::deep_copy(rowmapHost, rowmap); - auto colindsHost = Kokkos::create_mirror_view(colinds); - Kokkos::deep_copy(colindsHost, colinds); - size_type num = 0; - for(nnz_lno_t row = 0; row < numRows; row++) - { - nnz_lno_t rowLabel = labelsHost(row); - for(size_type j = rowmapHost(row); j < rowmapHost(row + 1); j++) - { - nnz_lno_t colLabel = labelsHost(colindsHost(j)); - if(rowLabel / blockSize == colLabel / blockSize) - num++; - } - } - return num; - } - nnz_view_t cuthill_mckee() { nnz_lno_t periph = find_peripheral(); @@ -670,7 +654,7 @@ struct BalloonClustering typedef Kokkos::TeamPolicy team_policy_t ; typedef typename team_policy_t::member_type team_member_t ; - BalloonClustering(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_) + BalloonClustering(size_type numRows_, const lno_row_view_t& rowmap_, const lno_nnz_view_t& colinds_) : numRows(numRows_), rowmap(rowmap_), colinds(colinds_), randPool(0xDEADBEEF) {} @@ -688,7 +672,7 @@ struct BalloonClustering struct BalloonFunctor { - BalloonFunctor(nnz_view_t& vertClusters_, nnz_view_t& clusterCounts_, nnz_view_t& distances_, lno_row_view_t& row_map_, lno_nnz_view_t& col_inds_, float_view_t pressure_, nnz_lno_t clusterSize_, RandPool& randPool_) + BalloonFunctor(const nnz_view_t& vertClusters_, const nnz_view_t& clusterCounts_, const nnz_view_t& distances_, const lno_row_view_t& row_map_, const lno_nnz_view_t& col_inds_, const float_view_t& pressure_, nnz_lno_t clusterSize_, RandPool& randPool_) : vertClusters(vertClusters_), clusterCounts(clusterCounts_), distances(distances_), row_map(row_map_), col_inds(col_inds_), pressure(pressure_), clusterSize(clusterSize_), numRows(row_map.extent(0) - 1), vertLocks(numRows), randPool(randPool_) { numClusters = (numRows + clusterSize - 1) / clusterSize; @@ -738,7 +722,7 @@ struct BalloonClustering for(size_type j = row_map(i); j < row_map(i + 1); j++) { nnz_lno_t nei = col_inds(j); - if(nei != i && vertClusters(nei) == cluster) + if(nei < numRows && nei != i && vertClusters(nei) == cluster) { //while we're at it, minimize distance to root as in Djikstra's if(distances(nei) + 1 < distances(i)) @@ -767,7 +751,7 @@ struct BalloonClustering { nnz_lno_t nei = col_inds(j); //to annex another vertex, it must be a non-root in a different cluster - if(nei != i && vertClusters(nei) != cluster && pressure(nei) < weakestPressure && distances(nei) != 0) + if(nei < numRows && nei != i && vertClusters(nei) != cluster && pressure(nei) < weakestPressure && distances(nei) != 0) { weakNei = nei; weakestPressure = pressure(nei); diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index af5e920e38..f105f9fb5e 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -526,6 +526,47 @@ void test_balloon_clustering(lno_t numRows, size_type nnzPerRow, lno_t bandwidth } } +template +void test_sgs_zero_rows() +{ + using namespace Test; + typedef typename KokkosSparse::CrsMatrix crsMat_t; + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_type; + typedef typename graph_t::entries_type::non_const_type entries_type; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef KokkosKernelsHandle + KernelHandle; + //The rowmap of a zero-row matrix can be length 0 or 1, so Gauss-Seidel should work with both + //(the setup and apply are essentially no-ops but they shouldn't crash or throw exceptions) + //For this test, create size-0 and size-1 rowmaps separately, and make sure each work with both point and cluster + for(int doingCluster = 0; doingCluster < 2; doingCluster++) + { + for(int rowmapLen = 0; rowmapLen < 2; rowmapLen++) + { + KernelHandle kh; + if(doingCluster) + kh.create_gs_handle(CLUSTER_DEFAULT, 10); + else + kh.create_gs_handle(GS_DEFAULT); + //initialized to 0 + row_map_type rowmap("Rowmap", rowmapLen); + entries_type entries("Entries", 0); + scalar_view_t values("Values", 0); + //also, make sure graph symmetrization doesn't crash on zero rows + gauss_seidel_symbolic(&kh, 0, 0, rowmap, entries, false); + gauss_seidel_numeric(&kh, 0, 0, rowmap, entries, values, false); + scalar_view_t x("X", 0); + scalar_view_t y("Y", 0); + scalar_t omega(0.9); + symmetric_gauss_seidel_apply + (&kh, 0, 0, rowmap, entries, values, x, y, false, true, omega, 3); + kh.destroy_gs_handle(); + } + } +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_asymmetric_rank1 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_gauss_seidel_rank1(2000, 2000 * 20, 200, 10, false); \ @@ -539,6 +580,9 @@ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank1 ## _ ## SCALAR TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank2 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_gauss_seidel_rank2(2000, 2000 * 20, 200, 10, 3, true); \ } \ +TEST_F( TestCategory, sparse ## _ ## gauss_seidel_zero_rows ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ + test_sgs_zero_rows(); \ +} \ TEST_F( TestCategory, sparse ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ test_rcm(10000, 50, 2000); \ } \