Commit

Merge pull request #9 from hatakeyamak/PFRecHitAndCluster_GPU_12_5_hackason2_tmp

Integrate developments from Marino
hatakeyamak authored Oct 4, 2022
2 parents 7f2d2b7 + 6ca17e5 commit cd7994d
Showing 1 changed file with 18 additions and 150 deletions.
168 changes: 18 additions & 150 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -3524,7 +3524,6 @@ namespace PFClusterCudaHCAL {
__global__ void topoClusterContraction(size_t size,
int* pfrh_parent,
int* pfrh_isSeed,
//const int* pfrh_neighbours, // temporary inputs for debugging
int* rhCount,
int* topoSeedCount,
int* topoRHCount,
@@ -3561,30 +3560,6 @@

} while (notDone);
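The do/while above is the pointer-jumping idiom: each pass replaces a rechit's parent with its grandparent until every chain is flat. A minimal sketch of the pattern, assuming a single-block launch and a block-shared notDone flag (flattenParents and its arguments are illustrative names, not part of this file):

__global__ void flattenParents(int n, int* parent) {
  __shared__ int notDone;
  do {
    __syncthreads();        // every thread has tested notDone before the reset
    if (threadIdx.x == 0)
      notDone = 0;
    __syncthreads();        // reset visible before updates start
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
      int p = parent[i];
      int gp = parent[p];   // grandparent
      if (p != gp) {
        parent[i] = gp;     // jump one level toward the root
        notDone = 1;        // benign race: every writer stores 1
      }
    }
    __syncthreads();        // all jumps and flag writes have landed
  } while (notDone);
}

Each pass roughly halves the depth of every parent chain, so the loop converges in O(log n) iterations.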

// // debugging printing block
// __syncthreads();
// if (threadIdx.x == 0) {
// int nnode=0;
// for (int i = 0; i < size; i++) {
// //printf("final pfrh_id,parent: %d %d\n",i,pfrh_parent[i]);
// if (i==pfrh_parent[i]) nnode++;
// }
// printf("pfrh_parent 3 multiplicity: %d\n",nnode);
// for (int pos = 0; pos < size; pos++) {
// int parent_target = pfrh_parent[pos];
// for (int i = 0; i < 8; i++) {
// int neighbor_id = pfrh_neighbours[pos * 8 + i];
// if (neighbor_id>-1){ // valid neighbors
// int parent_neighbor = pfrh_parent[neighbor_id];
// if (parent_target!=parent_neighbor){
// printf("hmm. they should have the same parent, but they don't. why... %d %d\n",pos,neighbor_id);
// }
// }
// }
// }
// }
// __syncthreads();

// Now determine the number of seeds and rechits in each topo cluster
for (int rhIdx = threadIdx.x; rhIdx < size; rhIdx += blockDim.x) {
int topoId = pfrh_parent[rhIdx];
@@ -3635,6 +3610,7 @@
}
}
__syncthreads();

if (threadIdx.x == 0) {
*pcrhFracSize = totalSeedFracOffset;
if (*pcrhFracSize>200000) // DeclsForKernels.h maxPFCFracs
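Between the contraction loop and this bookkeeping, the kernel tallies how many rechits and seeds each topo cluster contains. A simplified sketch of that counting step, assuming the count arrays are zeroed beforehand (the helper name is illustrative):

__device__ void countTopoContents(int size,
                                  const int* pfrh_parent,
                                  const int* pfrh_isSeed,
                                  int* topoRHCount,
                                  int* topoSeedCount) {
  for (int rhIdx = threadIdx.x; rhIdx < size; rhIdx += blockDim.x) {
    int topoId = pfrh_parent[rhIdx];         // root index labels the cluster
    atomicAdd(&topoRHCount[topoId], 1);      // every rechit counted once
    if (pfrh_isSeed[rhIdx])
      atomicAdd(&topoSeedCount[topoId], 1);  // seeds tallied separately
  }
}

Atomics are needed here because many rechits share one topo label, so plain increments would race.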
@@ -3807,8 +3783,11 @@
else
pfrh_edgeMask[idx] = 0;
}
__syncthreads();//!!

do {
__syncthreads();//!!

if (threadIdx.x == 0) {
notDone = 0;
}
@@ -3840,6 +3819,9 @@
}
}
}

__syncthreads();//!!

if (threadIdx.x == 0)
iter++;

@@ -3853,7 +3835,6 @@
if (threadIdx.x == 0) {
notDone = 0;
}

__syncthreads();

// Even linking
@@ -3864,7 +3845,6 @@
pfrh_parent[i] = (int)max(i, pfrh_edgeList[idx]);
}
}

__syncthreads();

// edgeParent
@@ -3885,25 +3865,23 @@
}
}

__syncthreads();//!!

if (threadIdx.x == 0)
iter++;

__syncthreads();

} while (notDone==1);

*topoIter = iter;
#ifdef DEBUG_GPU_HCAL
// if (threadIdx.x == 0) {
// printf("*** Topo clustering converged in %d iterations ***\n", iter);
// }
// __syncthreads();
#endif
__syncthreads();//!!

if (threadIdx.x == 0)
*topoIter = iter;
}
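The __syncthreads() calls tagged //!! are the substance of this hunk: a barrier must separate every test or write of the shared notDone flag from thread 0 resetting it, or a fast thread can clear the flag while a slow one is still using it. Condensed to its synchronization skeleton, the loop looks like this sketch (iterateUntilStable is an illustrative name, not this file's kernel):

__global__ void iterateUntilStable(int n, int* parent) {
  __shared__ int notDone;
  __shared__ int iter;
  if (threadIdx.x == 0)
    iter = 0;
  __syncthreads();                  // initialization visible to all threads
  do {
    __syncthreads();                // nobody is still testing notDone
    if (threadIdx.x == 0)
      notDone = 0;
    __syncthreads();                // reset visible before updates begin
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
      int p = parent[i], r = parent[p];
      if (p != r) {
        parent[i] = r;
        notDone = 1;
      }
    }
    __syncthreads();                // all updates and flag writes complete
    if (threadIdx.x == 0)
      iter++;
    __syncthreads();                // iter settled before the next test
  } while (notDone == 1);
}

Because notDone is only read after a barrier that follows all writes to it, every thread evaluates the loop condition identically and no thread exits early.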

__global__ void topoClusterLinkingKH(int nRH,
int* nEdgesIn,
//float* pfrh_energy, // Temporary entry for debugging
int* pfrh_parent,
int* pfrh_edgeId,
int* pfrh_edgeList,
@@ -3937,18 +3915,6 @@

// __syncthreads();

// // Print out debugging info
// if (threadIdx.x == 0) {
// // for (int idx = 0; idx < nEdges; idx++) {
// // printf("initial edge id, list, mask: %d %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx],pfrh_edgeMask[idx]);
// // //printf("initial edge id, list, mask: %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx]);
// // }
// printf("number of eges %d\n",nEdges);
// // for (int i = 0; i < nRH; i++) {
// // printf("initial pfrh_id,parent,energy: %d %d %8.3f\n",i,pfrh_parent[i],pfrh_energy[i]);
// // }
// }

// Explicitly initialize pfrh_parent
for (int i = start; i < nRH; i += gridStride) {
pfrh_parent[i] = i;
@@ -4005,19 +3971,7 @@

__syncthreads();

// Print out debugging info
// Connect remaining links
// if (threadIdx.x == 0) {
// int nnode=0;
// for (int i = 0; i < nRH; i++) {
// //printf("middle pfrh_id,parent: %d %d\n",i,pfrh_parent[i]);
// if (i==pfrh_parent[i]) nnode++;
// }
// printf("pfrh_parent multiplicity: %d\n",nnode);
// }

// __syncthreads();

// All rechit pairs in edge id-list have the same topo cluster label?
for (int idx = start; idx < nEdges; idx += gridStride) {
//for (int idx = 0; idx < nEdges; idx++) {
int i = pfrh_edgeId[idx]; // Get edge topo id
@@ -4062,18 +4016,6 @@

} while (notDone);

// __syncthreads();

// // Print out debugging info
// if (threadIdx.x == 0) {
// int nnode=0;
// for (int i = 0; i < nRH; i++) {
// //printf("middle2 pfrh_id,parent: %d %d\n",i,pfrh_parent[i]);
// if (i==pfrh_parent[i]) nnode++;
// }
// printf("pfrh_parent 2 multiplicity: %d\n",nnode);
// }

}
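The convergence test in this kernel walks the edge list and requests another pass whenever the two endpoints of a surviving edge still carry different topo labels. A sketch of that check under the same grid-stride convention (checkEdgeAgreement and the storage of notDone are illustrative):

__device__ void checkEdgeAgreement(int nEdges,
                                   const int* pfrh_edgeId,
                                   const int* pfrh_edgeList,
                                   const int* pfrh_edgeMask,
                                   const int* pfrh_parent,
                                   int* notDone) {
  int start = blockIdx.x * blockDim.x + threadIdx.x;
  int gridStride = blockDim.x * gridDim.x;
  for (int idx = start; idx < nEdges; idx += gridStride) {
    if (!pfrh_edgeMask[idx])
      continue;                                  // edge already merged away
    if (pfrh_parent[pfrh_edgeId[idx]] != pfrh_parent[pfrh_edgeList[idx]])
      atomicMax(notDone, 1);                     // labels still disagree
  }
}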

__device__ __forceinline__ void sortSwap(int* toSort, int a, int b) {
@@ -4133,14 +4075,17 @@
}

__device__ __forceinline__ int scan1Inclusive(int idata, volatile int* s_Data, int size) {
assert(size == 32);
int pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
s_Data[pos] = 0;
pos += size;
s_Data[pos] = idata;

for (int offset = 1; offset < size; offset <<= 1) {
int t = s_Data[pos] + s_Data[pos - offset];
__syncwarp();
s_Data[pos] = t;
__syncwarp();
}

return s_Data[pos];
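The added assert(size == 32) and the paired __syncwarp() calls reflect that threads in a warp no longer execute in lockstep on Volta and later, so the read-then-write on s_Data must be fenced. An equivalent register-only formulation using warp shuffles needs no shared memory or staging at all; a sketch, assuming a 1D block whose width is a multiple of 32:

__device__ __forceinline__ int warpInclusiveScan(int v) {
  const unsigned fullMask = 0xffffffffu;
  for (int offset = 1; offset < 32; offset <<= 1) {
    int t = __shfl_up_sync(fullMask, v, offset);
    if ((threadIdx.x & 31) >= offset)  // lanes below offset keep their value
      v += t;
  }
  return v;                            // inclusive prefix sum within the warp
}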
@@ -4630,18 +4575,9 @@
::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
float (&timer)[8]) {

#ifdef DEBUG_GPU_HCAL
cudaProfilerStart();
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, cudaStream);
#endif

int nRH = inputPFRecHits.size;

// Combined seeding & topo clustering thresholds, array initialization

seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH,
inputPFRecHits.pfrh_energy.get(),
inputPFRecHits.pfrh_x.get(),
@@ -4661,79 +4597,31 @@
outputGPU.topoSeedList.get(),
outputGPU.pfc_iter.get());

cudaCheck(cudaStreamSynchronize(cudaStream));

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[0], start, stop);
cudaEventRecord(start, cudaStream);
#endif

// prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
// nRH,
// outputGPU.nEdges.get(),
// outputGPU.pfrh_passTopoThresh.get(),
// inputPFRecHits.pfrh_neighbours.get(),
// scratchGPU.pfrh_edgeId.get(),
// scratchGPU.pfrh_edgeList.get());

// Topo clustering
// Fill edgeId, edgeList arrays with rechit neighbors
// Has a bug when using more than 128 threads.
// prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH,
outputGPU.nEdges.get(),
outputGPU.pfrh_passTopoThresh.get(),
inputPFRecHits.pfrh_neighbours.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get());
cudaCheck(cudaStreamSynchronize(cudaStream));
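prepareTopoInputs is launched with 128 * (8 + 4) * sizeof(int) bytes of dynamic shared memory: twelve ints per thread, presumably eight neighbour slots plus four words of bookkeeping. A sketch of how a kernel consumes memory sized this way; the partitioning is illustrative, not this file's actual layout:

__global__ void consumeDynamicSmem(int nRH) {
  extern __shared__ int smem[];            // sized by the launch's 3rd parameter
  int* neighbours = smem;                  // 8 ints per thread
  int* scratch = smem + 8 * blockDim.x;    // 4 ints per thread
  // ... cooperative filling of neighbours/scratch elided ...
}

// launched as: consumeDynamicSmem<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH);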

// prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>(
// nRH,
// outputGPU.nEdges.get(),
// outputGPU.pfrh_passTopoThresh.get(),
// inputPFRecHits.pfrh_neighbours.get(),
// scratchGPU.pfrh_edgeId.get(),
// scratchGPU.pfrh_edgeList.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[4], start, stop);
//printf("\nprepareTopoInputs took %f ms\n", timer[4]);

compareEdgeArrays<<<1, 1, 0, cudaStream>>>(outputGPU.nEdges.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get(),
nEdges,
inputGPU.pfrh_edgeId.get(),
inputGPU.pfrh_edgeList.get(),
nRH,
inputGPU.pfNeighFourInd.get(),
inputPFRecHits.pfrh_neighbours.get());

cudaEventRecord(start, cudaStream);
#endif

// Topo clustering
//topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH,
topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH,
outputGPU.nEdges.get(),
//inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging
outputGPU.pfrh_topoId.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get(),
scratchGPU.pfrh_edgeMask.get(),
//inputGPU.pfrh_edgeMask.get(),
outputGPU.pfrh_passTopoThresh.get(),
outputGPU.topoIter.get());
cudaCheck(cudaStreamSynchronize(cudaStream));

topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH,
outputGPU.pfrh_topoId.get(),
outputGPU.pfrh_isSeed.get(),
//inputPFRecHits.pfrh_neighbours.get(), // temporary entry for debugging
scratchGPU.rhcount.get(),
outputGPU.topoSeedCount.get(),
outputGPU.topoRHCount.get(),
@@ -4744,13 +4632,6 @@
outputGPU.pcrh_frac.get(),
outputGPU.pcrhFracSize.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[1], start, stop);
cudaEventRecord(start, cudaStream);
#endif

dim3 grid((nRH + 31) / 32, (nRH + 31) / 32);
dim3 block(32, 32);
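Both axes of this pairwise kernel use the file's recurring ceil-division launch shape; as a sketch (pairGrid is an illustrative helper, not part of this file):

// Round up so every rechit pair falls inside some 32x32 tile.
inline dim3 pairGrid(int n, int tile = 32) {
  int blocks = (n + tile - 1) / tile;
  return dim3(blocks, blocks);
}
// dim3 grid = pairGrid(nRH); dim3 block(32, 32);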

@@ -4763,13 +4644,6 @@
scratchGPU.rhcount.get(),
outputGPU.pcrh_fracInd.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[2], start, stop);
cudaEventRecord(start, cudaStream);
#endif

hcalFastCluster_selection<<<nRH, 256, 0, cudaStream>>>(nRH,
inputPFRecHits.pfrh_x.get(),
inputPFRecHits.pfrh_y.get(),
@@ -4793,11 +4667,5 @@
inputGPU.pfc_prevPos4.get(),
inputGPU.pfc_energy.get(),
outputGPU.pfc_iter.get());
#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[3], start, stop);
cudaProfilerStop();
#endif
}
} // namespace PFClusterCudaHCAL
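The DEBUG_GPU_HCAL blocks deleted throughout this launcher all followed one cudaEvent timing idiom; for reference, a self-contained sketch of it (timeOnStream and the enqueue callback are illustrative, not part of this file):

#include <cuda_runtime.h>

float timeOnStream(cudaStream_t stream, void (*enqueue)(cudaStream_t)) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);  // mark the start of the timed region
  enqueue(stream);                 // enqueue the kernel(s) to be timed
  cudaEventRecord(stop, stream);   // mark the end
  cudaEventSynchronize(stop);      // wait for the region to drain
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}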
