Skip to content

Commit

Permalink
Merge pull request #572 from brian-kelley/FixClusterOOB
Browse files Browse the repository at this point in the history
Fixed clustering, RCM, BFS accessing neighbors >= nv
  • Loading branch information
ndellingwood authored Jan 27, 2020
2 parents 7cc8819 + 768c7e6 commit 13ceecc
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 52 deletions.
19 changes: 6 additions & 13 deletions src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ namespace KokkosSparse{
struct BuildCrossClusterMaskFunctor
{
BuildCrossClusterMaskFunctor(Rowmap& rowmap_, Colinds& colinds_, nnz_view_t& clusterOffsets_, nnz_view_t& clusterVerts_, nnz_view_t& vertClusters_, bitset_t& mask_)
: rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_)
: numRows(rowmap_.extent(0) - 1), rowmap(rowmap_), colinds(colinds_), clusterOffsets(clusterOffsets_), clusterVerts(clusterVerts_), vertClusters(vertClusters_), mask(mask_)
{}

//Used a fixed-size hash set in shared memory
Expand Down Expand Up @@ -600,6 +600,9 @@ namespace KokkosSparse{
[&] (const nnz_lno_t j)
{
nnz_lno_t nei = colinds(rowmap(row) + j);
//Remote neighbors are not included
if(nei >= numRows)
return;
nnz_lno_t neiCluster = vertClusters(nei);
if(neiCluster != cluster)
{
Expand All @@ -622,6 +625,7 @@ namespace KokkosSparse{
return tableSize() * sizeof(int);
}

nnz_lno_t numRows;
Rowmap rowmap;
Colinds colinds;
nnz_view_t clusterOffsets;
Expand Down Expand Up @@ -776,21 +780,10 @@ namespace KokkosSparse{
raw_sym_xadj = raw_rowmap_t(sym_xadj.data(), sym_xadj.extent(0));
raw_sym_adj = raw_colinds_t(sym_adj.data(), sym_adj.extent(0));
}
bool onCuda = false;
#ifdef KOKKOS_ENABLE_CUDA
onCuda = std::is_same<MyExecSpace, Kokkos::Cuda>::value;
#endif
nnz_view_t vertClusters;
auto clusterAlgo = gsHandle->get_clustering_algo();
if(clusterAlgo == CLUSTER_DEFAULT)
{
//Use CM if > 50 entries per row, otherwise balloon clustering.
//CM is quite fast on CPUs if the level sets fan out quickly, otherwise slow and non-scalable.
if(!onCuda && (raw_sym_adj.extent(0) / num_rows > 50))
clusterAlgo = CLUSTER_CUTHILL_MCKEE;
else
clusterAlgo = CLUSTER_BALLOON;
}
clusterAlgo = CLUSTER_BALLOON;
switch(clusterAlgo)
{
case CLUSTER_CUTHILL_MCKEE:
Expand Down
62 changes: 23 additions & 39 deletions src/sparse/impl/KokkosSparse_partitioning_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ struct RCM
{
typedef Kokkos::View<nnz_lno_t**, MyTempMemorySpace, Kokkos::MemoryTraits<0>> WorkView;

BfsFunctor(WorkView& workQueue_, WorkView& scratch_, nnz_view_t& visit_, const_lno_row_view_t& rowmap_, const_lno_nnz_view_t& colinds_, single_view_t& numLevels_, nnz_view_t threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_)
BfsFunctor(const WorkView& workQueue_, const WorkView& scratch_, const nnz_view_t& visit_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const single_view_t& numLevels_, const nnz_view_t& threadNeighborCounts_, nnz_lno_t start_, nnz_lno_t numRows_)
: workQueue(workQueue_), scratch(scratch_), visit(visit_), rowmap(rowmap_), colinds(colinds_), numLevels(numLevels_), threadNeighborCounts(threadNeighborCounts_), start(start_), numRows(numRows_)
{}

Expand Down Expand Up @@ -310,7 +310,7 @@ struct RCM
{
nnz_lno_t col = colinds(j);
//use atomic here to guarantee neighbors are added to neighborList exactly once
if(Kokkos::atomic_compare_exchange_strong<nnz_lno_t>(&visit(col), NOT_VISITED, QUEUED))
if(col < numRows && Kokkos::atomic_compare_exchange_strong<nnz_lno_t>(&visit(col), NOT_VISITED, QUEUED))
{
//this thread is the first to see that col needs to be queued
neighborList(neiCount) = col;
Expand Down Expand Up @@ -394,9 +394,9 @@ struct RCM
nnz_lno_t numRows;
};

//parallel breadth-first search, producing level structure in (xadj, adj) form:
//xadj(level) gives index in adj where level begins
//also return the total number of levels
//Parallel breadth-first search, producing level structure in (xadj, adj) form:
//xadj(level) gives index in adj where level begins.
//Returns the total number of levels, and sets xadj, adj and maxDeg.
nnz_lno_t parallel_bfs(nnz_lno_t start, nnz_view_t& xadj, nnz_view_t& adj, nnz_lno_t& maxDeg, nnz_lno_t nthreads)
{
//need to know maximum degree to allocate scratch space for threads
Expand Down Expand Up @@ -426,9 +426,11 @@ struct RCM
{
typedef Kokkos::View<offset_t*, MyTempMemorySpace, Kokkos::MemoryTraits<0u>> ScoreView;

CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const_lno_row_view_t& rowmap_, const_lno_nnz_view_t& colinds_, ScoreView& scores_, ScoreView& scoresAux_, nnz_view_t& visit_, nnz_view_t& xadj_, nnz_view_t& adj_, nnz_view_t& adjAux_)
CuthillMcKeeFunctor(nnz_lno_t numLevels_, nnz_lno_t maxDegree_, const const_lno_row_view_t& rowmap_, const const_lno_nnz_view_t& colinds_, const ScoreView& scores_, const ScoreView& scoresAux_, const nnz_view_t& visit_, const nnz_view_t& xadj_, const nnz_view_t& adj_, const nnz_view_t& adjAux_)
: numLevels(numLevels_), maxDegree(maxDegree_), rowmap(rowmap_), colinds(colinds_), scores(scores_), scoresAux(scoresAux_), visit(visit_), xadj(xadj_), adj(adj_), adjAux(adjAux_)
{}
{
numRows = rowmap.extent(0) - 1;
}

KOKKOS_INLINE_FUNCTION void operator()(const team_member_t mem) const
{
Expand All @@ -455,9 +457,12 @@ struct RCM
for(offset_t j = rowStart; j < rowEnd; j++)
{
nnz_lno_t neighbor = colinds(j);
nnz_lno_t neighborVisit = visit(neighbor);
if(neighborVisit < minNeighbor)
minNeighbor = neighborVisit;
if(neighbor < numRows)
{
nnz_lno_t neighborVisit = visit(neighbor);
if(neighborVisit < minNeighbor)
minNeighbor = neighborVisit;
}
}
scores(i) = ((offset_t) minNeighbor * (maxDegree + 1)) + (rowmap(process + 1) - rowmap(process));
}
Expand All @@ -480,6 +485,7 @@ struct RCM
}
}

nnz_lno_t numRows;
nnz_lno_t numLevels;
nnz_lno_t maxDegree;
const_lno_row_view_t rowmap;
Expand All @@ -497,7 +503,7 @@ struct RCM
//Does the reversing in "reverse Cuthill-McKee")
struct OrderReverseFunctor
{
OrderReverseFunctor(nnz_view_t& visit_, nnz_lno_t numRows_)
OrderReverseFunctor(const nnz_view_t& visit_, nnz_lno_t numRows_)
: visit(visit_), numRows(numRows_)
{}

Expand Down Expand Up @@ -549,7 +555,7 @@ struct RCM
struct MinDegreeRowFunctor
{
typedef typename Reducer::value_type Value;
MinDegreeRowFunctor(const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {}
MinDegreeRowFunctor(const const_lno_row_view_t& rowmap_) : rowmap(rowmap_) {}
KOKKOS_INLINE_FUNCTION void operator()(const size_type i, Value& lval) const
{
size_type ideg = rowmap(i + 1) - rowmap(i);
Expand All @@ -565,7 +571,7 @@ struct RCM
//parallel-for functor that assigns a cluster given a envelope-reduced reordering (like RCM)
struct OrderToClusterFunctor
{
OrderToClusterFunctor(const nnz_view_t& ordering_, nnz_view_t vertClusters_, nnz_lno_t clusterSize_)
OrderToClusterFunctor(const nnz_view_t& ordering_, const nnz_view_t& vertClusters_, nnz_lno_t clusterSize_)
: ordering(ordering_), vertClusters(vertClusters_), clusterSize(clusterSize_)
{}

Expand All @@ -590,28 +596,6 @@ struct RCM
return v.loc;
}

size_type countBlockDiagEntries(nnz_view_t labels, nnz_lno_t blockSize)
{
auto labelsHost = Kokkos::create_mirror_view(labels);
Kokkos::deep_copy(labelsHost, labels);
auto rowmapHost = Kokkos::create_mirror_view(rowmap);
Kokkos::deep_copy(rowmapHost, rowmap);
auto colindsHost = Kokkos::create_mirror_view(colinds);
Kokkos::deep_copy(colindsHost, colinds);
size_type num = 0;
for(nnz_lno_t row = 0; row < numRows; row++)
{
nnz_lno_t rowLabel = labelsHost(row);
for(size_type j = rowmapHost(row); j < rowmapHost(row + 1); j++)
{
nnz_lno_t colLabel = labelsHost(colindsHost(j));
if(rowLabel / blockSize == colLabel / blockSize)
num++;
}
}
return num;
}

nnz_view_t cuthill_mckee()
{
nnz_lno_t periph = find_peripheral();
Expand Down Expand Up @@ -670,7 +654,7 @@ struct BalloonClustering
typedef Kokkos::TeamPolicy<MyExecSpace> team_policy_t ;
typedef typename team_policy_t::member_type team_member_t ;

BalloonClustering(size_type numRows_, lno_row_view_t& rowmap_, lno_nnz_view_t& colinds_)
BalloonClustering(size_type numRows_, const lno_row_view_t& rowmap_, const lno_nnz_view_t& colinds_)
: numRows(numRows_), rowmap(rowmap_), colinds(colinds_), randPool(0xDEADBEEF)
{}

Expand All @@ -688,7 +672,7 @@ struct BalloonClustering

struct BalloonFunctor
{
BalloonFunctor(nnz_view_t& vertClusters_, nnz_view_t& clusterCounts_, nnz_view_t& distances_, lno_row_view_t& row_map_, lno_nnz_view_t& col_inds_, float_view_t pressure_, nnz_lno_t clusterSize_, RandPool& randPool_)
BalloonFunctor(const nnz_view_t& vertClusters_, const nnz_view_t& clusterCounts_, const nnz_view_t& distances_, const lno_row_view_t& row_map_, const lno_nnz_view_t& col_inds_, const float_view_t& pressure_, nnz_lno_t clusterSize_, RandPool& randPool_)
: vertClusters(vertClusters_), clusterCounts(clusterCounts_), distances(distances_), row_map(row_map_), col_inds(col_inds_), pressure(pressure_), clusterSize(clusterSize_), numRows(row_map.extent(0) - 1), vertLocks(numRows), randPool(randPool_)
{
numClusters = (numRows + clusterSize - 1) / clusterSize;
Expand Down Expand Up @@ -738,7 +722,7 @@ struct BalloonClustering
for(size_type j = row_map(i); j < row_map(i + 1); j++)
{
nnz_lno_t nei = col_inds(j);
if(nei != i && vertClusters(nei) == cluster)
if(nei < numRows && nei != i && vertClusters(nei) == cluster)
{
//while we're at it, minimize distance to root as in Djikstra's
if(distances(nei) + 1 < distances(i))
Expand Down Expand Up @@ -767,7 +751,7 @@ struct BalloonClustering
{
nnz_lno_t nei = col_inds(j);
//to annex another vertex, it must be a non-root in a different cluster
if(nei != i && vertClusters(nei) != cluster && pressure(nei) < weakestPressure && distances(nei) != 0)
if(nei < numRows && nei != i && vertClusters(nei) != cluster && pressure(nei) < weakestPressure && distances(nei) != 0)
{
weakNei = nei;
weakestPressure = pressure(nei);
Expand Down
44 changes: 44 additions & 0 deletions unit_test/sparse/Test_Sparse_gauss_seidel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,47 @@ void test_balloon_clustering(lno_t numRows, size_type nnzPerRow, lno_t bandwidth
}
}

template <typename scalar_t, typename lno_t, typename size_type, typename device>
void test_sgs_zero_rows()
{
using namespace Test;
typedef typename KokkosSparse::CrsMatrix<scalar_t, lno_t, device, void, size_type> crsMat_t;
typedef typename crsMat_t::StaticCrsGraphType graph_t;
typedef typename graph_t::row_map_type::non_const_type row_map_type;
typedef typename graph_t::entries_type::non_const_type entries_type;
typedef typename crsMat_t::values_type::non_const_type scalar_view_t;
typedef KokkosKernelsHandle
<size_type, lno_t, scalar_t,
typename device::execution_space, typename device::memory_space,typename device::memory_space> KernelHandle;
//The rowmap of a zero-row matrix can be length 0 or 1, so Gauss-Seidel should work with both
//(the setup and apply are essentially no-ops but they shouldn't crash or throw exceptions)
//For this test, create size-0 and size-1 rowmaps separately, and make sure each work with both point and cluster
for(int doingCluster = 0; doingCluster < 2; doingCluster++)
{
for(int rowmapLen = 0; rowmapLen < 2; rowmapLen++)
{
KernelHandle kh;
if(doingCluster)
kh.create_gs_handle(CLUSTER_DEFAULT, 10);
else
kh.create_gs_handle(GS_DEFAULT);
//initialized to 0
row_map_type rowmap("Rowmap", rowmapLen);
entries_type entries("Entries", 0);
scalar_view_t values("Values", 0);
//also, make sure graph symmetrization doesn't crash on zero rows
gauss_seidel_symbolic(&kh, 0, 0, rowmap, entries, false);
gauss_seidel_numeric(&kh, 0, 0, rowmap, entries, values, false);
scalar_view_t x("X", 0);
scalar_view_t y("Y", 0);
scalar_t omega(0.9);
symmetric_gauss_seidel_apply
(&kh, 0, 0, rowmap, entries, values, x, y, false, true, omega, 3);
kh.destroy_gs_handle();
}
}
}

#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \
TEST_F( TestCategory, sparse ## _ ## gauss_seidel_asymmetric_rank1 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
test_gauss_seidel_rank1<SCALAR,ORDINAL,OFFSET,DEVICE>(2000, 2000 * 20, 200, 10, false); \
Expand All @@ -539,6 +580,9 @@ TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank1 ## _ ## SCALAR
TEST_F( TestCategory, sparse ## _ ## gauss_seidel_symmetric_rank2 ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
test_gauss_seidel_rank2<SCALAR,ORDINAL,OFFSET,DEVICE>(2000, 2000 * 20, 200, 10, 3, true); \
} \
TEST_F( TestCategory, sparse ## _ ## gauss_seidel_zero_rows ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
test_sgs_zero_rows<SCALAR,ORDINAL,OFFSET,DEVICE>(); \
} \
TEST_F( TestCategory, sparse ## _ ## rcm ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \
test_rcm<SCALAR,ORDINAL,OFFSET,DEVICE>(10000, 50, 2000); \
} \
Expand Down

0 comments on commit 13ceecc

Please sign in to comment.