Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zoltan2: (Coloring) Fixed D1 performance bug, added timing argument to coloring code #9208

Merged
merged 2 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
RCP<Environment> env;
RCP<const Teuchos::Comm<int> > comm;
bool verbose;

bool timing;

private:
//This function constructs a CSR with complete adjacency information for
Expand Down Expand Up @@ -688,8 +688,8 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
std::vector<int> recvcnts(comm->getSize(), 0);
Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);

//if we're computing statistics, remove the computation imbalance from the comm timer
if(verbose) comm->barrier();
//if we're reporting times, remove the computation imbalance from the comm timer
if(timing) comm->barrier();
double comm_total = 0.0;
double comm_temp = timer();

Expand Down Expand Up @@ -720,6 +720,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
const RCP<const Teuchos::Comm<int> > &comm_)
: adapter(adapter_), pl(pl_), env(env_), comm(comm_){
verbose = pl->get<bool>("verbose",false);
timing = pl->get<bool>("timing", false);
modelFlag_t flags;
flags.reset();
buildModel(flags);
Expand Down Expand Up @@ -1071,7 +1072,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
double conflict_detection = 0.0;

//Number of rounds we are saving statistics for
//100 is a decent default.
//100 is a decent default. Reporting requires --verbose argument.
const int numStatisticRecordingRounds = 100;

//includes all ghosts, including the second layer.
Expand Down Expand Up @@ -1322,7 +1323,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
//Done initializing, start coloring!

//use a barrier if we are reporting timing info
if(verbose) comm->barrier();
if(timing) comm->barrier();
interior_time = timer();
total_time = timer();
//give the entire local graph to KokkosKernels to color
Expand Down Expand Up @@ -1427,7 +1428,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
vertsPerRound[distributedRounds] = verts_to_recolor_size_host(0);
}

if(verbose) comm->barrier();
if(timing) comm->barrier();
double recolor_temp = timer();
//recolor using KokkosKernels' coloring function
if(verts_to_recolor_size_host(0) > 0){
Expand All @@ -1437,10 +1438,13 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
if(distributedRounds < numStatisticRecordingRounds){
recoloringPerRound[distributedRounds] = timer() - recolor_temp;
recoloring_time += recoloringPerRound[distributedRounds];
total_time += recoloringPerRound[distributedRounds];
comp_time += recoloringPerRound[distributedRounds];
compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
} else if(timing){
double recoloring_round_time = timer() - recolor_temp;
recoloring_time += recoloring_round_time;
comp_time += recoloring_round_time;
}

//reset the ghost colors to what they were before recoloring
Expand All @@ -1461,10 +1465,6 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
commPerRound[distributedRounds] = curr_comm_time;
recvPerRound[distributedRounds] = recv;
sentPerRound[distributedRounds] = sent;
if(verbose) {
std::cout<<comm->getRank()<<": total sent in round "<<distributedRounds<<" = "<<sent<<"\n";
std::cout<<comm->getRank()<<": total recv in round "<<distributedRounds<<" = "<<recv<<"\n";
}
totalPerRound[distributedRounds] += commPerRound[distributedRounds];
}

Expand Down Expand Up @@ -1507,7 +1507,11 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
comp_time += conflictDetectionPerRound[distributedRounds];
}
} else if(timing){
double conflict_detection_round_time = timer() - detection_temp;
conflict_detection += conflict_detection_round_time;
comp_time += conflict_detection_round_time;
}

distributedRounds++;
size_t localDone = recoloringSize_host(0);
Expand Down Expand Up @@ -1535,10 +1539,9 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
if(distributedRounds < numStatisticRecordingRounds){
vertsPerRound[distributedRounds] = recoloringSize_host(0);
}
if(verbose){
std::cout<<comm->getRank()<<": starting to recolor, serial\n";
comm->barrier();
}
if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";
if(timing) comm->barrier();

double recolor_temp = timer();
if(verts_to_recolor_size_host(0) > 0){
this->colorInterior_serial(femv_colors.size(), dist_adjs_host, dist_offsets_host, femv,
Expand All @@ -1547,10 +1550,13 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
if(distributedRounds < numStatisticRecordingRounds){
recoloringPerRound[distributedRounds] = timer() - recolor_temp;
recoloring_time += recoloringPerRound[distributedRounds];
total_time += recoloringPerRound[distributedRounds];
comp_time += recoloringPerRound[distributedRounds];
compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
} else if(timing){
double recoloring_serial_round_time = timer() - recolor_temp;
recoloring_time += recoloring_serial_round_time;
comp_time += recoloring_serial_round_time;
}

//reset the ghost colors to their previous values to avoid
Expand All @@ -1566,10 +1572,6 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
commPerRound[distributedRounds] = curr_comm_time;
recvPerRound[distributedRounds] = recv;
sentPerRound[distributedRounds] = sent;
if(verbose) {
std::cout<<comm->getRank()<<": total sent in round "<<distributedRounds<<" = "<<sent<<"\n";
std::cout<<comm->getRank()<<": total recv in round "<<distributedRounds<<" = "<<recv<<"\n";
}
totalPerRound[distributedRounds] += commPerRound[distributedRounds];
}

Expand All @@ -1579,7 +1581,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
ghost_colors_host(i) = colors_host(i+n_local);
}

if(verbose) comm->barrier();
if(timing) comm->barrier();
double detection_temp = timer();

//zero these out, they'll be updated by detectConflicts_serial
Expand All @@ -1599,7 +1601,12 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
comp_time += conflictDetectionPerRound[distributedRounds];
}
} else if(timing){
double conflict_detection_serial_round_time = timer() - detection_temp;
conflict_detection += conflict_detection_serial_round_time;
comp_time += conflict_detection_serial_round_time;
}

size_t globalDone = 0;
size_t localDone = recoloringSize_host(0);
Teuchos::reduceAll<int,size_t>(*comm, Teuchos::REDUCE_SUM, 1, &localDone, &globalDone);
Expand Down Expand Up @@ -1656,6 +1663,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,recvPerRound,finalRecvPerRound);
Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound,finalSentPerRound);
printf("Rank %d: boundary size: %ld\n",comm->getRank(),localBoundaryVertices);
if(comm->getRank() == 0) printf("Total boundary size: %ld\n",totalBoundarySize);
for(int i = 0; i < std::min((int)distributedRounds,numStatisticRecordingRounds); i++){
printf("Rank %d: recolor %ld vertices in round %d\n",comm->getRank(), vertsPerRound[i],i);
printf("Rank %d: sentbuf had %lld entries in round %d\n", comm->getRank(), sentPerRound[i],i);
Expand All @@ -1672,6 +1680,7 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
printf("comp time in round %d: %f\n",i,finalCompPerRound[i]);
}
}
} else if (timing){
double global_total_time = 0.0;
double global_recoloring_time = 0.0;
double global_min_recoloring_time = 0.0;
Expand All @@ -1689,7 +1698,6 @@ class AlgTwoGhostLayer : public Algorithm<Adapter> {
comm->barrier();
fflush(stdout);
if(comm->getRank()==0){
printf("Boundary size: %ld\n",totalBoundarySize);
printf("Total Time: %f\n",global_total_time);
printf("Interior Time: %f\n",global_interior_time);
printf("Recoloring Time: %f\n",global_recoloring_time);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,9 @@ class AlgDistance1TwoGhostLayer : public AlgTwoGhostLayer<Adapter> {
}
},recoloringSize(0));
Kokkos::fence();
Kokkos::parallel_for(femv_colors.size(), KOKKOS_LAMBDA (const size_t& i){
Kokkos::parallel_for("rebuild verts_to_send and verts_to_recolor",
Kokkos::RangePolicy<ExecutionSpace>(0,femv_colors.size()),
KOKKOS_LAMBDA (const size_t& i){
if(femv_colors(i) == 0){
if(i < n_local){
verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,11 +232,15 @@ class AlgDistance1 : public Algorithm<Adapter>
}
}
},recoloringSize(0));
Kokkos::parallel_for(n_local, KOKKOS_LAMBDA(const int& i){
Kokkos::fence();
Kokkos::parallel_for("Rebuild verts_to_send_view",
Kokkos::RangePolicy<ExecutionSpace>(0,n_local),
KOKKOS_LAMBDA(const int& i){
if(femv_colors(i) == 0){
verts_to_send_view(verts_to_send_size_atomic(0)++) = i;
}
});
Kokkos::fence();
}

private:
Expand Down Expand Up @@ -317,8 +321,8 @@ class AlgDistance1 : public Algorithm<Adapter>
std::vector<int> recvcnts(comm->getSize(), 0);
Teuchos::ArrayView<int> recvcnts_view = Teuchos::arrayViewFromVector(recvcnts);

//if we're computing statistics, remove the computation imbalance from the comm timer.
if(verbose) comm->barrier();
//if we're reporting timings, remove the computation imbalance from the comm timer.
if(timing) comm->barrier();
double comm_total = 0.0;
double comm_temp = timer();

Expand Down Expand Up @@ -346,7 +350,7 @@ class AlgDistance1 : public Algorithm<Adapter>
RCP<Environment> env;
RCP<const Teuchos::Comm<int> > comm;
bool verbose;

bool timing;
public:
//constructor for the hybrid distributed distance-1 algorithm
AlgDistance1(
Expand All @@ -356,6 +360,7 @@ class AlgDistance1 : public Algorithm<Adapter>
const RCP<const Teuchos::Comm<int> > &comm_)
: adapter(adapter_), pl(pl_), env(env_), comm(comm_) {
verbose = pl->get<bool>("verbose",false);
timing = pl->get<bool>("timing", false);
if(verbose) std::cout<<comm->getRank()<<": inside coloring constructor\n";
modelFlag_t flags;
flags.reset();
Expand Down Expand Up @@ -724,18 +729,18 @@ class AlgDistance1 : public Algorithm<Adapter>
if(verbose) std::cout<<comm->getRank()<<": Coloring interior\n";
//initialize interior and total timers, barrier to prevent any imbalance from setup.
//Only use a barrier if timing is happening.
if(verbose) comm->barrier();
if(timing) comm->barrier();
interior_time = timer();
total_time = timer();
//call the KokkosKernels coloring function with the Tpetra default spaces.
bool use_vbbit = (global_max_degree < 6000);
this->colorInterior<execution_space,memory_space>
(nVtx, dist_adjs, dist_offsets, femv,dist_adjs,0,use_vbbit);
if(verbose){
if(timing){
interior_time = timer() - interior_time;
comp_time = interior_time;
std::cout<<comm->getRank()<<": Going to recolor\n";
}
if(verbose) std::cout<<comm->getRank()<<": Going to recolor\n";
bool recolor_degrees = this->pl->template get<bool>("recolor_degrees", true);

//if there is more than a single process, check distributed conflicts and recolor
Expand Down Expand Up @@ -819,7 +824,6 @@ class AlgDistance1 : public Algorithm<Adapter>
if(distributedRounds < numStatisticRecordingRounds) {
vertsPerRound[distributedRounds] = recoloringSize_host(0);
}
if(verbose) std::cout<<comm->getRank()<<": starting to recolor\n";

//copying the send view to the recolor view is necessary because
//KokkosKernels can change the view passed in, and we need the send view
Expand All @@ -844,9 +848,12 @@ class AlgDistance1 : public Algorithm<Adapter>
comp_time += recoloringPerRound[distributedRounds];
compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
} else if(timing) {
double recolor_round_time = timer() - recolor_temp;
recoloring_time += recolor_round_time;
comp_time += recolor_round_time;
}

if(verbose) std::cout<<comm->getRank()<<": done recoloring\n";
//reset the recoloringSize host and device views
//to zero
recoloringSize_host(0) = 0;
Expand Down Expand Up @@ -874,10 +881,6 @@ class AlgDistance1 : public Algorithm<Adapter>
comm_time += curr_comm_time;
if(distributedRounds < numStatisticRecordingRounds){
commPerRound[distributedRounds] = curr_comm_time;
if(verbose){
std::cout<<comm->getRank()<<": total sent in round "<<distributedRounds<<" = "<<sent<<"\n";
std::cout<<comm->getRank()<<": total recv in round "<<distributedRounds<<" = "<<recv<<"\n";
}
sentPerRound[distributedRounds] = sent;
recvPerRound[distributedRounds] = recv;
totalPerRound[distributedRounds] += commPerRound[distributedRounds];
Expand Down Expand Up @@ -913,6 +916,10 @@ class AlgDistance1 : public Algorithm<Adapter>
compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
comp_time += conflictDetectionPerRound[distributedRounds];
} else if(timing){
double conflict_detection_round_time = timer()- detection_temp;
conflict_detection += conflict_detection_round_time;
comp_time += conflict_detection_round_time;
}
//do a reduction to determine if we're done
int globalDone = 0;
Expand Down Expand Up @@ -942,7 +949,6 @@ class AlgDistance1 : public Algorithm<Adapter>
if(distributedRounds < 100){
vertsPerRound[distributedRounds] = recoloringSize_host(0);
}
if(verbose) std::cout<<comm->getRank()<<": starting to recolor, serial\n";

double recolor_temp = timer();
//use KokkosKernels to recolor the conflicting vertices
Expand All @@ -958,9 +964,12 @@ class AlgDistance1 : public Algorithm<Adapter>
comp_time += recoloringPerRound[distributedRounds];
compPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
totalPerRound[distributedRounds] = recoloringPerRound[distributedRounds];
}
} else if(timing){
double recolor_serial_round_time = timer() - recolor_temp;
recoloring_time += recolor_serial_round_time;
comp_time += recolor_serial_round_time;
}

if(verbose) std::cout<<comm->getRank()<<": done recoloring\n";
recoloringSize_host(0) = 0;

for(size_t i = 0; i < rand.size() -nVtx; i++){
Expand All @@ -980,10 +989,6 @@ class AlgDistance1 : public Algorithm<Adapter>

if(distributedRounds < numStatisticRecordingRounds){
commPerRound[distributedRounds] = curr_comm_time;
if(verbose){
std::cout<<comm->getRank()<<": total sent in round "<<distributedRounds<<" = "<<sent<<"\n";
std::cout<<comm->getRank()<<": total recv in round "<<distributedRounds<<" = "<<recv<<"\n";
}
sentPerRound[distributedRounds] = sent;
recvPerRound[distributedRounds] = recv;
totalPerRound[distributedRounds] += commPerRound[distributedRounds];
Expand Down Expand Up @@ -1012,7 +1017,11 @@ class AlgDistance1 : public Algorithm<Adapter>
compPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
totalPerRound[distributedRounds] += conflictDetectionPerRound[distributedRounds];
comp_time += conflictDetectionPerRound[distributedRounds];
}
} else if(timing){
double conflict_detection_serial_round_time = timer() - detection_temp;
conflict_detection += conflict_detection_serial_round_time;
comp_time += conflict_detection_serial_round_time;
}
//do a reduction to determine if we're done
int globalDone = 0;
int localDone = recoloringSize_host(0);
Expand All @@ -1036,8 +1045,8 @@ class AlgDistance1 : public Algorithm<Adapter>
}
//print how many rounds of speculating/correcting happened (this should be the same for all ranks):
if(comm->getRank()==0) printf("did %d rounds of distributed coloring\n", distributedRounds);
int totalBoundarySize = 0;
int totalVertsPerRound[numStatisticRecordingRounds];
int totalBoundarySize = 0;
double finalTotalPerRound[numStatisticRecordingRounds];
double maxRecoloringPerRound[numStatisticRecordingRounds];
double finalSerialRecoloringPerRound[numStatisticRecordingRounds];
Expand Down Expand Up @@ -1072,6 +1081,7 @@ class AlgDistance1 : public Algorithm<Adapter>
Teuchos::reduceAll<int,gno_t> (*comm, Teuchos::REDUCE_SUM,numStatisticRecordingRounds,sentPerRound, finalSentPerRound);

printf("Rank %d: boundary size: %d\n",comm->getRank(),localBoundaryVertices);
if(comm->getRank()==0) printf("Total boundary size: %d\n",totalBoundarySize);
for(int i = 0; i < std::min(distributedRounds,numStatisticRecordingRounds); i++){
printf("Rank %d: recolor %d vertices in round %d\n",comm->getRank(),vertsPerRound[i],i);
if(comm->getRank()==0) printf("recolored %d vertices in round %d\n",totalVertsPerRound[i],i);
Expand All @@ -1085,7 +1095,7 @@ class AlgDistance1 : public Algorithm<Adapter>
if(comm->getRank()==0) printf("total recv in round %d: %lld\n",i,finalRecvPerRound[i]);
if(comm->getRank()==0) printf("comp time in round %d: %f\n",i,finalCompPerRound[i]);
}
} else if(timing){
double global_total_time = 0.0;
double global_recoloring_time=0.0;
double global_min_recoloring_time=0.0;
Expand All @@ -1103,7 +1113,6 @@ class AlgDistance1 : public Algorithm<Adapter>
comm->barrier();
fflush(stdout);
if(comm->getRank()==0){
printf("Boundary size: %d\n",totalBoundarySize);
printf("Total Time: %f\n",global_total_time);
printf("Interior Time: %f\n",global_interior_time);
printf("Recoloring Time: %f\n",global_recoloring_time);
Expand All @@ -1112,8 +1121,8 @@ class AlgDistance1 : public Algorithm<Adapter>
printf("Comm Time: %f\n",global_comm_time);
printf("Comp Time: %f\n",global_comp_time);
}
std::cout<<comm->getRank()<<": exiting coloring\n";
}
if(verbose) std::cout<<comm->getRank()<<": exiting coloring\n";
}
};

Expand Down
Loading