Commit

Merge pull request #9 from hatakeyamak/PFRecHitAndCluster_GPU_12_5_hackason2_tmp

Integrate developments from Marino
hatakeyamak authored Oct 4, 2022
2 parents 7f2d2b7 + 6ca17e5 commit cd7994d
Showing 1 changed file with 18 additions and 150 deletions.
168 changes: 18 additions & 150 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -3524,7 +3524,6 @@ namespace PFClusterCudaHCAL {
__global__ void topoClusterContraction(size_t size,
int* pfrh_parent,
int* pfrh_isSeed,
//const int* pfrh_neighbours, // temporary inputs for debugging
int* rhCount,
int* topoSeedCount,
int* topoRHCount,
@@ -3561,30 +3560,6 @@

} while (notDone);
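The do/while above is the pointer-jumping idiom: each pass replaces a rechit's parent with its grandparent until every chain is flat. A minimal sketch of the pattern, assuming a single-block launch and a block-shared notDone flag (flattenParents and its arguments are illustrative names, not part of this file):

__global__ void flattenParents(int n, int* parent) {
  __shared__ int notDone;
  do {
    __syncthreads();        // every thread has tested notDone before the reset
    if (threadIdx.x == 0)
      notDone = 0;
    __syncthreads();        // reset visible before updates start
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
      int p = parent[i];
      int gp = parent[p];   // grandparent
      if (p != gp) {
        parent[i] = gp;     // jump one level toward the root
        notDone = 1;        // benign race: every writer stores 1
      }
    }
    __syncthreads();        // all jumps and flag writes have landed
  } while (notDone);
}

Each pass roughly halves the depth of every parent chain, so the loop converges in O(log n) iterations.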

// // debugging printing block
// __syncthreads();
// if (threadIdx.x == 0) {
// int nnode=0;
// for (int i = 0; i < size; i++) {
// //printf("final pfrh_id,parent: %d %d\n",i,pfrh_parent[i]);
// if (i==pfrh_parent[i]) nnode++;
// }
// printf("pfrh_parent 3 multiplicity: %d\n",nnode);
// for (int pos = 0; pos < size; pos++) {
// int parent_target = pfrh_parent[pos];
// for (int i = 0; i < 8; i++) {
// int neighbor_id = pfrh_neighbours[pos * 8 + i];
// if (neighbor_id>-1){ // valid neighbors
// int parent_neighbor = pfrh_parent[neighbor_id];
// if (parent_target!=parent_neighbor){
// printf("hmm. they should have the same parent, but they don't. why... %d %d\n",pos,neighbor_id);
// }
// }
// }
// }
// }
// __syncthreads();

// Now determine the number of seeds and rechits in each topo cluster
for (int rhIdx = threadIdx.x; rhIdx < size; rhIdx += blockDim.x) {
int topoId = pfrh_parent[rhIdx];
@@ -3635,6 +3610,7 @@
}
}
__syncthreads();

if (threadIdx.x == 0) {
*pcrhFracSize = totalSeedFracOffset;
if (*pcrhFracSize>200000) // DeclsForKernels.h maxPFCFracs
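Between the contraction loop and this bookkeeping, the kernel tallies how many rechits and seeds each topo cluster contains. A simplified sketch of that counting step, assuming the count arrays are zeroed beforehand (the helper name is illustrative):

__device__ void countTopoContents(int size,
                                  const int* pfrh_parent,
                                  const int* pfrh_isSeed,
                                  int* topoRHCount,
                                  int* topoSeedCount) {
  for (int rhIdx = threadIdx.x; rhIdx < size; rhIdx += blockDim.x) {
    int topoId = pfrh_parent[rhIdx];         // root index labels the cluster
    atomicAdd(&topoRHCount[topoId], 1);      // every rechit counted once
    if (pfrh_isSeed[rhIdx])
      atomicAdd(&topoSeedCount[topoId], 1);  // seeds tallied separately
  }
}

Atomics are needed here because many rechits share one topo label, so plain increments would race.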
@@ -3807,8 +3783,11 @@
else
pfrh_edgeMask[idx] = 0;
}
__syncthreads();//!!

do {
__syncthreads();//!!

if (threadIdx.x == 0) {
notDone = 0;
}
@@ -3840,6 +3819,9 @@
}
}
}

__syncthreads();//!!

if (threadIdx.x == 0)
iter++;

@@ -3853,7 +3835,6 @@
if (threadIdx.x == 0) {
notDone = 0;
}

__syncthreads();

// Even linking
@@ -3864,7 +3845,6 @@
pfrh_parent[i] = (int)max(i, pfrh_edgeList[idx]);
}
}

__syncthreads();

// edgeParent
@@ -3885,25 +3865,23 @@
}
}

__syncthreads();//!!

if (threadIdx.x == 0)
iter++;

__syncthreads();

} while (notDone==1);

*topoIter = iter;
#ifdef DEBUG_GPU_HCAL
// if (threadIdx.x == 0) {
// printf("*** Topo clustering converged in %d iterations ***\n", iter);
// }
// __syncthreads();
#endif
__syncthreads();//!!

if (threadIdx.x == 0)
*topoIter = iter;
}
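The __syncthreads() calls tagged //!! are the substance of this hunk: a barrier must separate every test or write of the shared notDone flag from thread 0 resetting it, or a fast thread can clear the flag while a slow one is still using it. Condensed to its synchronization skeleton, the loop looks like this sketch (iterateUntilStable is an illustrative name, not this file's kernel):

__global__ void iterateUntilStable(int n, int* parent) {
  __shared__ int notDone;
  __shared__ int iter;
  if (threadIdx.x == 0)
    iter = 0;
  __syncthreads();                  // initialization visible to all threads
  do {
    __syncthreads();                // nobody is still testing notDone
    if (threadIdx.x == 0)
      notDone = 0;
    __syncthreads();                // reset visible before updates begin
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
      int p = parent[i], r = parent[p];
      if (p != r) {
        parent[i] = r;
        notDone = 1;
      }
    }
    __syncthreads();                // all updates and flag writes complete
    if (threadIdx.x == 0)
      iter++;
    __syncthreads();                // iter settled before the next test
  } while (notDone == 1);
}

Because notDone is only read after a barrier that follows all writes to it, every thread evaluates the loop condition identically and no thread exits early.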

__global__ void topoClusterLinkingKH(int nRH,
int* nEdgesIn,
//float* pfrh_energy, // Temporary entry for debugging
int* pfrh_parent,
int* pfrh_edgeId,
int* pfrh_edgeList,
@@ -3937,18 +3915,6 @@

// __syncthreads();

// // Print out debugging info
// if (threadIdx.x == 0) {
// // for (int idx = 0; idx < nEdges; idx++) {
// // printf("initial edge id, list, mask: %d %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx],pfrh_edgeMask[idx]);
// // //printf("initial edge id, list, mask: %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx]);
// // }
// printf("number of eges %d\n",nEdges);
// // for (int i = 0; i < nRH; i++) {
// // printf("initial pfrh_id,parent,energy: %d %d %8.3f\n",i,pfrh_parent[i],pfrh_energy[i]);
// // }
// }

// Explicitly initialize pfrh_parent
for (int i = start; i < nRH; i += gridStride) {
pfrh_parent[i] = i;
@@ -4005,19 +3971,7 @@

__syncthreads();

// Print out debugging info
// Connect remaining links
// if (threadIdx.x == 0) {
// int nnode=0;
// for (int i = 0; i < nRH; i++) {
// //printf("middle pfrh_id,parent: %d %d\n",i,pfrh_parent[i]);
// if (i==pfrh_parent[i]) nnode++;
// }
// printf("pfrh_parent multiplicity: %d\n",nnode);
// }

// __syncthreads();

// All rechit pairs in edge id-list have the same topo cluster label?
for (int idx = start; idx < nEdges; idx += gridStride) {
//for (int idx = 0; idx < nEdges; idx++) {
int i = pfrh_edgeId[idx]; // Get edge topo id
@@ -4062,18 +4016,6 @@

} while (notDone);

// __syncthreads();

// // Print out debugging info
// if (threadIdx.x == 0) {
// int nnode=0;
// for (int i = 0; i < nRH; i++) {
// //printf("middle2 pfrh_id,parent: %d %d\n",i,pfrh_parent[i]);
// if (i==pfrh_parent[i]) nnode++;
// }
// printf("pfrh_parent 2 multiplicity: %d\n",nnode);
// }

}
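The convergence test in this kernel walks the edge list and requests another pass whenever the two endpoints of a surviving edge still carry different topo labels. A sketch of that check under the same grid-stride convention (checkEdgeAgreement and the storage of notDone are illustrative):

__device__ void checkEdgeAgreement(int nEdges,
                                   const int* pfrh_edgeId,
                                   const int* pfrh_edgeList,
                                   const int* pfrh_edgeMask,
                                   const int* pfrh_parent,
                                   int* notDone) {
  int start = blockIdx.x * blockDim.x + threadIdx.x;
  int gridStride = blockDim.x * gridDim.x;
  for (int idx = start; idx < nEdges; idx += gridStride) {
    if (!pfrh_edgeMask[idx])
      continue;                                  // edge already merged away
    if (pfrh_parent[pfrh_edgeId[idx]] != pfrh_parent[pfrh_edgeList[idx]])
      atomicMax(notDone, 1);                     // labels still disagree
  }
}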

__device__ __forceinline__ void sortSwap(int* toSort, int a, int b) {
@@ -4133,14 +4075,17 @@
}

__device__ __forceinline__ int scan1Inclusive(int idata, volatile int* s_Data, int size) {
assert(size == 32);
int pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
s_Data[pos] = 0;
pos += size;
s_Data[pos] = idata;

for (int offset = 1; offset < size; offset <<= 1) {
int t = s_Data[pos] + s_Data[pos - offset];
__syncwarp();
s_Data[pos] = t;
__syncwarp();
}

return s_Data[pos];
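The added assert(size == 32) and the paired __syncwarp() calls reflect that threads in a warp no longer execute in lockstep on Volta and later, so the read-then-write on s_Data must be fenced. An equivalent register-only formulation using warp shuffles needs no shared memory or staging at all; a sketch, assuming a 1D block whose width is a multiple of 32:

__device__ __forceinline__ int warpInclusiveScan(int v) {
  const unsigned fullMask = 0xffffffffu;
  for (int offset = 1; offset < 32; offset <<= 1) {
    int t = __shfl_up_sync(fullMask, v, offset);
    if ((threadIdx.x & 31) >= offset)  // lanes below offset keep their value
      v += t;
  }
  return v;                            // inclusive prefix sum within the warp
}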
@@ -4630,18 +4575,9 @@
::PFClustering::HCAL::ScratchDataGPU& scratchGPU,
float (&timer)[8]) {

#ifdef DEBUG_GPU_HCAL
cudaProfilerStart();
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, cudaStream);
#endif

int nRH = inputPFRecHits.size;

// Combined seeding & topo clustering thresholds, array initialization

seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH,
inputPFRecHits.pfrh_energy.get(),
inputPFRecHits.pfrh_x.get(),
@@ -4661,79 +4597,31 @@
outputGPU.topoSeedList.get(),
outputGPU.pfc_iter.get());

cudaCheck(cudaStreamSynchronize(cudaStream));

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[0], start, stop);
cudaEventRecord(start, cudaStream);
#endif

// prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
// nRH,
// outputGPU.nEdges.get(),
// outputGPU.pfrh_passTopoThresh.get(),
// inputPFRecHits.pfrh_neighbours.get(),
// scratchGPU.pfrh_edgeId.get(),
// scratchGPU.pfrh_edgeList.get());

// Topo clustering
// Fill edgeId, edgeList arrays with rechit neighbors
// Has a bug when using more than 128 threads.
// prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>(
prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH,
outputGPU.nEdges.get(),
outputGPU.pfrh_passTopoThresh.get(),
inputPFRecHits.pfrh_neighbours.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get());
cudaCheck(cudaStreamSynchronize(cudaStream));
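prepareTopoInputs is launched with 128 * (8 + 4) * sizeof(int) bytes of dynamic shared memory: twelve ints per thread, presumably eight neighbour slots plus four words of bookkeeping. A sketch of how a kernel consumes memory sized this way; the partitioning is illustrative, not this file's actual layout:

__global__ void consumeDynamicSmem(int nRH) {
  extern __shared__ int smem[];            // sized by the launch's 3rd parameter
  int* neighbours = smem;                  // 8 ints per thread
  int* scratch = smem + 8 * blockDim.x;    // 4 ints per thread
  // ... cooperative filling of neighbours/scratch elided ...
}

// launched as: consumeDynamicSmem<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH);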

// prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>(
// nRH,
// outputGPU.nEdges.get(),
// outputGPU.pfrh_passTopoThresh.get(),
// inputPFRecHits.pfrh_neighbours.get(),
// scratchGPU.pfrh_edgeId.get(),
// scratchGPU.pfrh_edgeList.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[4], start, stop);
//printf("\nprepareTopoInputs took %f ms\n", timer[4]);

compareEdgeArrays<<<1, 1, 0, cudaStream>>>(outputGPU.nEdges.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get(),
nEdges,
inputGPU.pfrh_edgeId.get(),
inputGPU.pfrh_edgeList.get(),
nRH,
inputGPU.pfNeighFourInd.get(),
inputPFRecHits.pfrh_neighbours.get());

cudaEventRecord(start, cudaStream);
#endif

// Topo clustering
//topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH,
topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH,
outputGPU.nEdges.get(),
//inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging
outputGPU.pfrh_topoId.get(),
scratchGPU.pfrh_edgeId.get(),
scratchGPU.pfrh_edgeList.get(),
scratchGPU.pfrh_edgeMask.get(),
//inputGPU.pfrh_edgeMask.get(),
outputGPU.pfrh_passTopoThresh.get(),
outputGPU.topoIter.get());
cudaCheck(cudaStreamSynchronize(cudaStream));

topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH,
outputGPU.pfrh_topoId.get(),
outputGPU.pfrh_isSeed.get(),
//inputPFRecHits.pfrh_neighbours.get(), // temporary entry for debugging
scratchGPU.rhcount.get(),
outputGPU.topoSeedCount.get(),
outputGPU.topoRHCount.get(),
@@ -4744,13 +4632,6 @@
outputGPU.pcrh_frac.get(),
outputGPU.pcrhFracSize.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[1], start, stop);
cudaEventRecord(start, cudaStream);
#endif

dim3 grid((nRH + 31) / 32, (nRH + 31) / 32);
dim3 block(32, 32);
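Both axes of this pairwise kernel use the file's recurring ceil-division launch shape; as a sketch (pairGrid is an illustrative helper, not part of this file):

// Round up so every rechit pair falls inside some 32x32 tile.
inline dim3 pairGrid(int n, int tile = 32) {
  int blocks = (n + tile - 1) / tile;
  return dim3(blocks, blocks);
}
// dim3 grid = pairGrid(nRH); dim3 block(32, 32);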

@@ -4763,13 +4644,6 @@
scratchGPU.rhcount.get(),
outputGPU.pcrh_fracInd.get());

#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[2], start, stop);
cudaEventRecord(start, cudaStream);
#endif

hcalFastCluster_selection<<<nRH, 256, 0, cudaStream>>>(nRH,
inputPFRecHits.pfrh_x.get(),
inputPFRecHits.pfrh_y.get(),
@@ -4793,11 +4667,5 @@
inputGPU.pfc_prevPos4.get(),
inputGPU.pfc_energy.get(),
outputGPU.pfc_iter.get());
#ifdef DEBUG_GPU_HCAL
cudaEventRecord(stop, cudaStream);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&timer[3], start, stop);
cudaProfilerStop();
#endif
}
} // namespace PFClusterCudaHCAL
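The DEBUG_GPU_HCAL blocks deleted throughout this launcher all followed one cudaEvent timing idiom; for reference, a self-contained sketch of it (timeOnStream and the enqueue callback are illustrative, not part of this file):

#include <cuda_runtime.h>

float timeOnStream(cudaStream_t stream, void (*enqueue)(cudaStream_t)) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);  // mark the start of the timed region
  enqueue(stream);                 // enqueue the kernel(s) to be timed
  cudaEventRecord(stop, stream);   // mark the end
  cudaEventSynchronize(stop);      // wait for the region to drain
  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}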
