diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cc index 017300076edb9..9c731ccf0d52b 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cc @@ -71,7 +71,7 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks, std::vector hits_and_covariances; float * hits_and_covariancesGPU = nullptr; - std::vector helix_fit_results; + Rfit::helix_fit * helix_fit_results = nullptr; Rfit::helix_fit * helix_fit_resultsGPU = nullptr; const int points_in_seed = 4; @@ -101,26 +101,24 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks, } // We pretend to have one fit for every seed - helix_fit_results.resize(total_seeds); - - cudaCheck(cudaMalloc((void**)&hits_and_covariancesGPU, sizeof(float)*hits_and_covariances.size())); - cudaCheck(cudaMalloc((void**)&helix_fit_resultsGPU, sizeof(Rfit::helix_fit)*helix_fit_results.size())); + cudaCheck(cudaMallocHost(&helix_fit_results, sizeof(Rfit::helix_fit)*total_seeds)); + cudaCheck(cudaMalloc(&hits_and_covariancesGPU, sizeof(float)*hits_and_covariances.size())); + cudaCheck(cudaMalloc(&helix_fit_resultsGPU, sizeof(Rfit::helix_fit)*total_seeds)); + cudaCheck(cudaMemset(helix_fit_resultsGPU, 0x00, sizeof(Rfit::helix_fit)*total_seeds )); // CUDA MALLOC OF HITS AND COV AND HELIX_FIT RESULTS // CUDA MEMCOPY HOST2DEVICE OF HITS AND COVS AND HELIX_FIT RESULTS cudaCheck(cudaMemcpy(hits_and_covariancesGPU, hits_and_covariances.data(), - sizeof(float)*hits_and_covariances.size(), cudaMemcpyHostToDevice)); + sizeof(float)*hits_and_covariances.size(), cudaMemcpyDefault)); // LAUNCH THE KERNEL FIT launchKernelFit(hits_and_covariancesGPU, 12*4*total_seeds, 4, bField, helix_fit_resultsGPU); // CUDA MEMCOPY DEVICE2HOST OF HELIX_FIT cudaCheck(cudaDeviceSynchronize()); -// cudaCheck(cudaMemcpy(helix_fit_results.data(), helix_fit_resultsGPU, -// sizeof(Rfit::helix_fit)*helix_fit_results.size(), cudaMemcpyDeviceToHost)); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(helix_fit_results.data(), helix_fit_resultsGPU, - sizeof(Rfit::helix_fit)*helix_fit_results.size(), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(helix_fit_results, helix_fit_resultsGPU, + sizeof(Rfit::helix_fit)*total_seeds, cudaMemcpyDefault)); cudaCheck(cudaFree(hits_and_covariancesGPU)); cudaCheck(cudaFree(helix_fit_resultsGPU)); @@ -137,7 +135,11 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks, const TrackingRegion& region = regionHitSets.region(); for(const SeedingHitSet& tuplet: regionHitSets) { auto nHits = tuplet.size(); hits.resize(nHits); - for (unsigned int iHit = 0; iHit < nHits; ++iHit) hits[iHit] = tuplet[iHit]; + + for (unsigned int iHit = 0; iHit < nHits; ++iHit) + { + hits[iHit] = tuplet[iHit]; + } auto const &fittedTrack = helix_fit_results[counter]; counter++; int iCharge = fittedTrack.q; @@ -146,7 +148,7 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks, float valPt = fittedTrack.par(2); float valCotTheta = fittedTrack.par(3); float valZip = fittedTrack.par(4); - // + // PixelTrackErrorParam param(valEta, valPt); float errValPhi = std::sqrt(fittedTrack.cov(0, 0)); float errValTip = std::sqrt(fittedTrack.cov(1, 1)); @@ -178,7 +180,9 @@ void PixelTrackReconstructionGPU::run(TracksWithTTRHs& tracks, } } - // skip ovelrapped tracks + cudaCheck(cudaFreeHost(helix_fit_results)); + + // skip overlapped tracks if(!theCleanerName.empty()) { edm::ESHandle hcleaner; es.get().get(theCleanerName, hcleaner); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cu b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cu index 3ebda8ce94ba8..5391f2cd5f979 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cu +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.cu @@ -4,18 +4,20 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "PixelTrackReconstructionGPU.h" -#ifndef GPU_DEBUG -#define GPU_DEBUG 0 -#endif // GPU_DEBUG - using namespace Eigen; -__global__ -void KernelFullFitAllHits(float * hits_and_covariances, +__global__ void +KernelFastFitAllHits(float *hits_and_covariances, int hits_in_fit, int cumulative_size, - double B, - Rfit::helix_fit * results) { + float B, + Rfit::helix_fit *results, + Rfit::Matrix3xNd *hits, + Rfit::Matrix3Nd *hits_cov, + Rfit::circle_fit *circle_fit, + Vector4d *fast_fit, + Rfit::line_fit *line_fit) +{ // Reshape Eigen components from hits_and_covariances, using proper thread and block indices // Perform the fit // Store the results in the proper vector, using again correct indices @@ -24,101 +26,187 @@ void KernelFullFitAllHits(float * hits_and_covariances, // first 3 are the points // the rest is the covariance matrix, 3x3 int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12; - int helix_start = (blockIdx.x * blockDim.x + threadIdx.x); + int helix_start = (blockIdx.x * blockDim.x + threadIdx.x); if (start >= cumulative_size) { return; } -#if GPU_DEBUG +#ifdef GPU_DEBUG printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, cumulative_size: %d\n", blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size); #endif - Rfit::Matrix3xNd hits(3,hits_in_fit); - Rfit::Matrix3Nd hits_cov(3 * hits_in_fit, 3 * hits_in_fit); + hits[helix_start].resize(3, hits_in_fit); + hits_cov[helix_start].resize(3 * hits_in_fit, 3 * hits_in_fit); // Prepare data structure (stack) for (unsigned int i = 0; i < hits_in_fit; ++i) { - hits.col(i) << hits_and_covariances[start], - hits_and_covariances[start+1],hits_and_covariances[start+2]; + hits[helix_start].col(i) << hits_and_covariances[start], + hits_and_covariances[start + 1], hits_and_covariances[start + 2]; start += 3; for (auto j = 0; j < 3; ++j) { for (auto l = 0; l < 3; ++l) { - hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = hits_and_covariances[start]; + hits_cov[helix_start](i + j * hits_in_fit, i + l * hits_in_fit) = + hits_and_covariances[start]; start++; } } } -#if GPU_DEBUG - printf("KernelFullFitAllHits hits(0,0): %d\t%f\n", helix_start, hits(0,0)); - printf("KernelFullFitAllHits hits(0,1): %d\t%f\n", helix_start, hits(0,1)); - printf("KernelFullFitAllHits hits(0,2): %d\t%f\n", helix_start, hits(0,2)); - printf("KernelFullFitAllHits hits(0,3): %d\t%f\n", helix_start, hits(0,3)); - printf("KernelFullFitAllHits hits(1,0): %d\t%f\n", helix_start, hits(1,0)); - printf("KernelFullFitAllHits hits(1,1): %d\t%f\n", helix_start, hits(1,1)); - printf("KernelFullFitAllHits hits(1,2): %d\t%f\n", helix_start, hits(1,2)); - printf("KernelFullFitAllHits hits(1,3): %d\t%f\n", helix_start, hits(1,3)); - printf("KernelFullFitAllHits hits(2,0): %d\t%f\n", helix_start, hits(2,0)); - printf("KernelFullFitAllHits hits(2,1): %d\t%f\n", helix_start, hits(2,1)); - printf("KernelFullFitAllHits hits(2,2): %d\t%f\n", helix_start, hits(2,2)); - printf("KernelFullFitAllHits hits(2,3): %d\t%f\n", helix_start, hits(2,3)); - Rfit::printIt(&hits); - Rfit::printIt(&hits_cov); -#endif + fast_fit[helix_start] = Rfit::Fast_fit(hits[helix_start]); +} - // Perform actual fit - Vector4d fast_fit = Rfit::Fast_fit(hits); +__global__ void +KernelCircleFitAllHits(float *hits_and_covariances, int hits_in_fit, + int cumulative_size, float B, Rfit::helix_fit *results, + Rfit::Matrix3xNd *hits, Rfit::Matrix3Nd *hits_cov, + Rfit::circle_fit *circle_fit, Vector4d *fast_fit, + Rfit::line_fit *line_fit) +{ + // Reshape Eigen components from hits_and_covariances, using proper thread and block indices + // Perform the fit + // Store the results in the proper vector, using again correct indices -#if GPU_DEBUG - printf("KernelFullFitAllHits fast_fit(0): %d %f\n", helix_start, fast_fit(0)); - printf("KernelFullFitAllHits fast_fit(1): %d %f\n", helix_start, fast_fit(1)); - printf("KernelFullFitAllHits fast_fit(2): %d %f\n", helix_start, fast_fit(2)); - printf("KernelFullFitAllHits fast_fit(3): %d %f\n", helix_start, fast_fit(3)); -#endif + // Loop for hits_in_fit times: + // first 3 are the points + // the rest is the covariance matrix, 3x3 + int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12; + int helix_start = (blockIdx.x * blockDim.x + threadIdx.x); + if (start >= cumulative_size) { + return; + } - u_int n = hits.cols(); -#if GPU_DEBUG - printf("KernelFullFitAllHits using %d hits: %d\n", n, helix_start); +#ifdef GPU_DEBUG + printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, " + "cumulative_size: %d\n", + blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size); +#endif + u_int n = hits[helix_start].cols(); + + Rfit::VectorNd rad = (hits[helix_start].block(0, 0, 2, n).colwise().norm()); + + circle_fit[helix_start] = + Rfit::Circle_fit(hits[helix_start].block(0, 0, 2, n), + hits_cov[helix_start].block(0, 0, 2 * n, 2 * n), + fast_fit[helix_start], rad, B, true, true); + +#ifdef GPU_DEBUG + printf("KernelCircleFitAllHits circle.par(0): %d %f\n", helix_start, + circle_fit[helix_start].par(0)); + printf("KernelCircleFitAllHits circle.par(1): %d %f\n", helix_start, + circle_fit[helix_start].par(1)); + printf("KernelCircleFitAllHits circle.par(2): %d %f\n", helix_start, + circle_fit[helix_start].par(2)); #endif - Rfit::VectorNd rad = (hits.block(0, 0, 2, n).colwise().norm()); +} + +__global__ void +KernelLineFitAllHits(float *hits_and_covariances, int hits_in_fit, + int cumulative_size, float B, Rfit::helix_fit *results, + Rfit::Matrix3xNd *hits, Rfit::Matrix3Nd *hits_cov, + Rfit::circle_fit *circle_fit, Vector4d *fast_fit, + Rfit::line_fit *line_fit) +{ + // Reshape Eigen components from hits_and_covariances, using proper thread and block indices + // Perform the fit + // Store the results in the proper vector, using again correct indices + + // Loop for hits_in_fit times: + // first 3 are the points + // the rest is the covariance matrix, 3x3 + int start = (blockIdx.x * blockDim.x + threadIdx.x) * hits_in_fit * 12; + int helix_start = (blockIdx.x * blockDim.x + threadIdx.x); + if (start >= cumulative_size) { + return; + } - Rfit::circle_fit circle = - Rfit::Circle_fit(hits.block(0,0,2,n), hits_cov.block(0, 0, 2 * n, 2 * n), - fast_fit, rad, B, true, true); +#ifdef GPU_DEBUG -#if GPU_DEBUG - printf("KernelFullFitAllHits circle.par(0): %d %f\n", helix_start, circle.par(0)); - printf("KernelFullFitAllHits circle.par(1): %d %f\n", helix_start, circle.par(1)); - printf("KernelFullFitAllHits circle.par(2): %d %f\n", helix_start, circle.par(2)); + printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, start: %d, " + "cumulative_size: %d\n", + blockDim.x, blockIdx.x, threadIdx.x, start, cumulative_size); #endif - Rfit::line_fit line = Rfit::Line_fit(hits, hits_cov, circle, fast_fit, true); + line_fit[helix_start] = + Rfit::Line_fit(hits[helix_start], hits_cov[helix_start], + circle_fit[helix_start], fast_fit[helix_start], true); - par_uvrtopak(circle, B, true); + par_uvrtopak(circle_fit[helix_start], B, true); // Grab helix_fit from the proper location in the output vector Rfit::helix_fit &helix = results[helix_start]; - helix.par << circle.par, line.par; + helix.par << circle_fit[helix_start].par, line_fit[helix_start].par; + // TODO: pass properly error booleans - if (1) { - helix.cov = MatrixXd::Zero(5, 5); - helix.cov.block(0, 0, 3, 3) = circle.cov; - helix.cov.block(3, 3, 2, 2) = line.cov; - } - helix.q = circle.q; - helix.chi2_circle = circle.chi2; - helix.chi2_line = line.chi2; + + helix.cov = MatrixXd::Zero(5, 5); + helix.cov.block(0, 0, 3, 3) = circle_fit[helix_start].cov; + helix.cov.block(3, 3, 2, 2) = line_fit[helix_start].cov; + + helix.q = circle_fit[helix_start].q; + helix.chi2_circle = circle_fit[helix_start].chi2; + helix.chi2_line = line_fit[helix_start].chi2; + +#ifdef GPU_DEBUG + + printf("KernelLineFitAllHits line.par(0): %d %f\n", helix_start, + circle_fit[helix_start].par(0)); + printf("KernelLineFitAllHits line.par(1): %d %f\n", helix_start, + line_fit[helix_start].par(1)); +#endif } -void PixelTrackReconstructionGPU::launchKernelFit(float * hits_and_covariancesGPU, - int cumulative_size, int hits_in_fit, float B, Rfit::helix_fit * results) { - const dim3 threads_per_block(32,1); - // We need to partition data in blocks of: - // 12(3+9) * hits_in_fit - int num_blocks = cumulative_size/(hits_in_fit*12)/threads_per_block.x + 1; - KernelFullFitAllHits<<>>(hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results); +void PixelTrackReconstructionGPU::launchKernelFit( + float *hits_and_covariancesGPU, int cumulative_size, int hits_in_fit, + float B, Rfit::helix_fit *results) +{ + const dim3 threads_per_block(32, 1); + int num_blocks = cumulative_size / (hits_in_fit * 12) / threads_per_block.x + 1; + auto numberOfSeeds = cumulative_size / (hits_in_fit * 12); + + Rfit::Matrix3xNd *hitsGPU; + cudaCheck(cudaMalloc(&hitsGPU, 48 * numberOfSeeds * sizeof(Rfit::Matrix3xNd(3, 4)))); + cudaCheck(cudaMemset(hitsGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::Matrix3xNd(3, 4)))); + + Rfit::Matrix3Nd *hits_covGPU = nullptr; + cudaCheck(cudaMalloc(&hits_covGPU, 48 * numberOfSeeds * sizeof(Rfit::Matrix3Nd(12, 12)))); + cudaCheck(cudaMemset(hits_covGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::Matrix3Nd(12, 12)))); + + Vector4d *fast_fit_resultsGPU = nullptr; + cudaCheck(cudaMalloc(&fast_fit_resultsGPU, 48 * numberOfSeeds * sizeof(Vector4d))); + cudaCheck(cudaMemset(fast_fit_resultsGPU, 0x00, 48 * numberOfSeeds * sizeof(Vector4d))); + + Rfit::circle_fit *circle_fit_resultsGPU = nullptr; + cudaCheck(cudaMalloc(&circle_fit_resultsGPU, 48 * numberOfSeeds * sizeof(Rfit::circle_fit))); + cudaCheck(cudaMemset(circle_fit_resultsGPU, 0x00, 48 * numberOfSeeds * sizeof(Rfit::circle_fit))); + + Rfit::line_fit *line_fit_resultsGPU = nullptr; + cudaCheck(cudaMalloc(&line_fit_resultsGPU, numberOfSeeds * sizeof(Rfit::line_fit))); + cudaCheck(cudaMemset(line_fit_resultsGPU, 0x00, numberOfSeeds * sizeof(Rfit::line_fit))); + + KernelFastFitAllHits<<>>( + hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results, + hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, + line_fit_resultsGPU); cudaCheck(cudaGetLastError()); + + KernelCircleFitAllHits<<>>( + hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results, + hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, + line_fit_resultsGPU); + cudaCheck(cudaGetLastError()); + + KernelLineFitAllHits<<>>( + hits_and_covariancesGPU, hits_in_fit, cumulative_size, B, results, + hitsGPU, hits_covGPU, circle_fit_resultsGPU, fast_fit_resultsGPU, + line_fit_resultsGPU); + cudaCheck(cudaGetLastError()); + + cudaFree(hitsGPU); + cudaFree(hits_covGPU); + cudaFree(fast_fit_resultsGPU); + cudaFree(circle_fit_resultsGPU); + cudaFree(line_fit_resultsGPU); } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.h b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.h index b5085d3ee970d..7488ba43b6f93 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackReconstructionGPU.h @@ -1,27 +1,30 @@ -#ifndef RecoPixelVertexing_PixelTrackFitting_PixelTrackReconstructionGPU_H -#define RecoPixelVertexing_PixelTrackFitting_PixelTrackReconstructionGPU_H +#ifndef RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackReconstructionGPU_h +#define RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackReconstructionGPU_h -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h" -#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" -#include "FWCore/Framework/interface/ConsumesCollector.h" +#include +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/EDGetToken.h" - -#include +#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/TracksWithHits.h" class PixelFitter; class PixelTrackCleaner; class PixelTrackFilter; class RegionsSeedingHitSets; -namespace edm { class Event; class EventSetup; class Run; class ParameterSetDescription;} +namespace edm { + class Event; + class EventSetup; + class Run; + class ParameterSetDescription; +} class PixelTrackReconstructionGPU { public: - PixelTrackReconstructionGPU( const edm::ParameterSet& conf, - edm::ConsumesCollector && iC); + PixelTrackReconstructionGPU( const edm::ParameterSet& conf, edm::ConsumesCollector && iC); ~PixelTrackReconstructionGPU(); static void fillDescriptions(edm::ParameterSetDescription& desc); @@ -36,5 +39,5 @@ class PixelTrackReconstructionGPU { edm::EDGetTokenT theFilterToken; std::string theCleanerName; }; -#endif +#endif // RecoPixelVertexing_PixelTrackFitting_plugins_PixelTrackReconstructionGPU_h