Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

patch from cherry-pick of commit 7bfd16f from PR279 to remove OpenMP,… #280

Merged
merged 1 commit into from
Oct 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ ifdef AVX_512
VEC_GCC := -mavx512f -mavx512cd # -march=native -fopt-info-vec -mavx
VEC_ICC := -xHost -qopt-zmm-usage=high #-march=native -mtune=native #-xcore-avx512
else ifdef AVX2
VEC_GCC := -mavx2
VEC_ICC := -mavx2
VEC_GCC := -mavx2 -mfma
VEC_ICC := -mavx2 -mfma
else
VEC_GCC := -mavx # -fopt-info-vec-all
VEC_ICC := -mavx
Expand Down Expand Up @@ -150,11 +150,11 @@ ifdef USE_CUDA
CPPFLAGS += -DUSE_CUDA -I${CUBROOT} -I${CUDAINCDIR} #-g -G -lineinfo
LDFLAGS_HOST += -L${CUDALIBDIR}
ifeq ($(CXX),icpc)
CXXFLAGS += -qopenmp
LDFLAGS += -qopenmp
CXXFLAGS += -qopenmp-simd
LDFLAGS += -qopenmp-simd
else
CXXFLAGS += -fopenmp
LDFLAGS += -fopenmp
CXXFLAGS += -fopenmp-simd
LDFLAGS += -fopenmp-simd
endif
endif
#CXXFLAGS += -qopenmp
Expand All @@ -177,14 +177,14 @@ endif

ifeq (${CXX}, ${ICC})
VEC_HOST := ${VEC_ICC}
CXXFLAGS += -qopt-report=5 -qopenmp -qopt-report-phase=all
CXXFLAGS += -qopt-report=5 -qopenmp-simd -qopt-report-phase=all
else
VEC_HOST := ${VEC_GCC}
endif

ifeq ($(CXX), g++)
CXXFLAGS += -std=c++1z -ftree-vectorize -Werror=main -Werror=pointer-arith -Werror=overlength-strings -Wno-vla -Werror=overflow -Wstrict-overflow -Werror=array-bounds -Werror=format-contains-nul -Werror=type-limits -fvisibility-inlines-hidden -fno-math-errno --param vect-max-version-for-alias-checks=50 -Xassembler --compress-debug-sections -felide-constructors -fmessage-length=0 -Wall -Wno-non-template-friend -Wno-long-long -Wreturn-type -Wunused -Wparentheses -Wno-deprecated -Werror=return-type -Werror=missing-braces -Werror=unused-value -Werror=address -Werror=format -Werror=sign-compare -Werror=write-strings -Werror=delete-non-virtual-dtor -Wstrict-aliasing -Werror=narrowing -Werror=unused-but-set-variable -Werror=reorder -Werror=unused-variable -Werror=conversion-null -Werror=return-local-addr -Wnon-virtual-dtor -Werror=switch -fdiagnostics-show-option -Wno-unused-local-typedefs -Wno-attributes -Wno-psabi
CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp
CXXFLAGS += -fdiagnostics-color=always -fdiagnostics-show-option -pthread -pipe -fopenmp-simd
endif

ifdef WITH_USOLIDS
Expand Down
5 changes: 3 additions & 2 deletions Matriplex/MatriplexCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@
#define MUL(a, b) _mm512_mul_ps(a, b)
#define FMA(a, b, v) _mm512_fmadd_ps(a, b, v)

#elif defined(__AVX2__)
#elif defined(__AVX2__) && defined(__FMA__)

typedef __m256 IntrVec_t;
#define MPLEX_INTRINSICS_WIDTH_BYTES 32
#define MPLEX_INTRINSICS_WIDTH_BITS 256
#define AVX2_INTRINSICS
#define GATHER_INTRINSICS
#define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_epi32(arr);
// Previously used _mm256_load_epi32(arr) here, but that's part of AVX-512F, not AVX2
#define GATHER_IDX_LOAD(name, arr) __m256i name = _mm256_load_si256(reinterpret_cast<const __m256i *>(arr));

#define LD(a, i) _mm256_load_ps(&a[i*N+n])
#define ST(a, i, r) _mm256_store_ps(&a[i*N+n], r)
Expand Down
4 changes: 2 additions & 2 deletions mkFit/FitterCU-imp.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,8 @@ void FitterCU<T>::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC,
#if 0
double time_input = dtime();
int itrack;
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for
//omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for
for (int i = beg; i < end; ++i) {
itrack = i - beg;
Track &trk = tracks[i];
Expand Down
2 changes: 1 addition & 1 deletion mkFit/FitterCU.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "index_selection_kernels.h"
#include "best_hit_kernels.h"

#include <omp.h>
//#include <omp.h>
#include <stdexcept>

#define BLOCK_SIZE_X 256
Expand Down
21 changes: 9 additions & 12 deletions mkFit/MkFitter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,14 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>& tracks,
int itrack = 0;

// FIXME: uncomment when track building is ported to GPU.
#if USE_CUDA_NOT_YET
//#ifdef USE_CUDA
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i, ++itrack)
{
const Track &trk = tracks[i];
Expand Down Expand Up @@ -121,14 +120,13 @@ void MkFitter::InputTracksAndHits(const std::vector<Track>& tracks,

int itrack;
//#ifdef USE_CUDA
#if 0
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i) {
itrack = i - beg;
const Track &trk = tracks[i];
Expand Down Expand Up @@ -173,14 +171,13 @@ void MkFitter::SlurpInTracksAndHits(const std::vector<Track>& tracks,
MatriplexTrackPacker mtp(tracks[beg]);

//#ifdef USE_CUDA
#if 0
// This openmp loop brings some performances when using
// a single thread to fit all events.
// However, it is more advantageous to use the threads to
// parallelize over Events.
omp_set_num_threads(Config::numThreadsReorg);
#pragma omp parallel for private(itrack)
#endif
// omp_set_num_threads(Config::numThreadsReorg);
//#pragma omp parallel for private(itrack)
//#endif
for (int i = beg; i < end; ++i)
{
int itrack = i - beg;
Expand Down
21 changes: 12 additions & 9 deletions mkFit/fittestMPlex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#if USE_CUDA
#include "fittestMPlex.h"
#include "FitterCU.h"
#include <omp.h>
//#include <omp.h>
#endif

#ifndef NO_ROOT
Expand Down Expand Up @@ -142,15 +142,18 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
#endif
separate_first_call_for_meaningful_profiling_numbers();

// Reorgnanization (copyIn) can eventually be multithreaded.
omp_set_nested(1);
// Reorganization (copyIn) can eventually be multithreaded.
// FIXME: revisit multithreading when track building is ported to GPU.
// omp_set_nested(1);

omp_set_num_threads(Config::numThreadsEvents);
// omp_set_num_threads(Config::numThreadsEvents);
double total_gpu_time = dtime();
#pragma omp parallel reduction(+:s_tmp)
//#pragma omp parallel reduction(+:s_tmp)
{
int numThreadsEvents = omp_get_num_threads();
int thr_idx = omp_get_thread_num();
// int numThreadsEvents = omp_get_num_threads();
// int thr_idx = omp_get_thread_num();
int numThreadsEvents = 1;
int thr_idx = 0;

// FitterCU is declared here to share allocations and deallocations
// between the multiple events processed by a single thread.
Expand All @@ -177,11 +180,11 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
#if 0 // 0 for timing, 1 for validation
// Validation crashes for multiple threads.
// It is something in relation to ROOT. Not sure what.
if (omp_get_num_threads() <= 1) {
//if (omp_get_num_threads() <= 1) {
//if (g_run_fit_std) {
std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root";
//}
}
//}
#endif
}
cuFitter.free_extra_addBestHit();
Expand Down