diff --git a/examples/fodo_programmable/run_fodo_programmable.py b/examples/fodo_programmable/run_fodo_programmable.py
index 8899c7ea3..33ecc8f03 100755
--- a/examples/fodo_programmable/run_fodo_programmable.py
+++ b/examples/fodo_programmable/run_fodo_programmable.py
@@ -117,7 +117,6 @@ def my_ref_drift(pge, refpart):
 pge1.beam_particles = lambda pti, refpart: my_drift(pge1, pti, refpart)
 pge1.ref_particle = lambda refpart: my_ref_drift(pge1, refpart)
 pge1.ds = 0.25
-pge1.threadsafe = True  # allow OpenMP threading for speed
 
 # attention: assignment is a reference for pge2 = pge1
@@ -126,7 +125,6 @@ def my_ref_drift(pge, refpart):
 pge2.beam_particles = lambda pti, refpart: my_drift(pge2, pti, refpart)
 pge2.ref_particle = lambda refpart: my_ref_drift(pge2, refpart)
 pge2.ds = 0.5
-pge2.threadsafe = True  # allow OpenMP threading for speed
 
 # add beam diagnostics
 monitor = elements.BeamMonitor("monitor", backend="h5")
diff --git a/src/ImpactX.cpp b/src/ImpactX.cpp
index 3a2796c52..eca17e2c3 100644
--- a/src/ImpactX.cpp
+++ b/src/ImpactX.cpp
@@ -94,13 +94,11 @@ namespace impactx
         // init blocks / grids & MultiFabs
         amr_data->InitFromScratch(0.0);
 
-        // alloc particle containers
-        // have to resize here, not in the constructor because grids have not
+        // prepare particle containers
+        // have to do this here, not in the constructor because grids have not
         //   been built when constructor was called.
-        amr_data->track_particles.m_particle_container->reserveData();
-        amr_data->track_particles.m_particle_container->resizeData();
-        amr_data->track_particles.m_particles_lost->reserveData();
-        amr_data->track_particles.m_particles_lost->resizeData();
+        amr_data->track_particles.m_particle_container->prepare();
+        amr_data->track_particles.m_particles_lost->prepare();
 
         // register shortcut
         amr_data->track_particles.m_particle_container->SetLostParticleContainer(amr_data->track_particles.m_particles_lost.get());
diff --git a/src/particles/ImpactXParticleContainer.H b/src/particles/ImpactXParticleContainer.H
index 27fdebc10..00f518370 100644
--- a/src/particles/ImpactXParticleContainer.H
+++ b/src/particles/ImpactXParticleContainer.H
@@ -143,6 +143,13 @@ namespace impactx
         //! Destruct a particle container
         virtual ~ImpactXParticleContainer() = default;
 
+        /** Prepares the container for use.
+         *
+         * This sets the tile size to a sensible default and resizes the
+         * underlying data structures.
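+         *
+         * Without space charge, particle work is OpenMP-parallelized over
+         * tiles, so this also shrinks the tile size until level 0, grid 0
+         * has at least one tile per OpenMP thread. For example, a single
+         * box of 16x16 cells in (y, z) run with 8 threads starts at the
+         * default 8x8 tile size, i.e. 2x2 = 4 tiles; halving the z tile
+         * size once (8 -> 4) gives 2x4 = 8 tiles, one per thread.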
+         */
+        void prepare ();
+
         /** Add new particles to the container for fixed s.
          *
          * Note: This can only be used *after* the initialization (grids) have
diff --git a/src/particles/ImpactXParticleContainer.cpp b/src/particles/ImpactXParticleContainer.cpp
index 7b9dc3c1a..306e8098d 100644
--- a/src/particles/ImpactXParticleContainer.cpp
+++ b/src/particles/ImpactXParticleContainer.cpp
@@ -13,6 +13,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -107,6 +108,61 @@ namespace impactx
         SetParticleShape(v);
     }
 
+    void
+    ImpactXParticleContainer::prepare ()
+    {
+        // make sure we have at least one box with enough tiles for each OpenMP thread
+
+        // make sure level 0, grid 0 exists
+        int lid = 0, gid = 0;
+        {
+            const auto& pmap = ParticleDistributionMap(lid).ProcessorMap();
+            auto it = std::find(pmap.begin(), pmap.end(), amrex::ParallelDescriptor::MyProc());
+            if (it == std::end(pmap)) {
+                amrex::Abort("Particle container needs to have at least one grid.");
+            } else {
+                gid = *it;
+            }
+        }
+
+        int nthreads = 1;
+#if defined(AMREX_USE_OMP)
+        nthreads = omp_get_max_threads();
+#endif
+
+        // When running without space charge but with OpenMP parallelization,
+        // we need to make sure level 0, grid 0 has enough tiles to thread
+        // over. We try to set the tile_size appropriately here. The tiles
+        // start as (very large number, 8, 8) in (x, y, z), so we reduce the
+        // tile size in the y and z directions alternately until there are
+        // enough tiles for the number of threads.
+
+        const auto& ba = ParticleBoxArray(lid);
+        auto n_logical = numTilesInBox(ba[gid], true, tile_size);
+
+        int ntry = 0;
+        constexpr int max_tries = 6;
+        while ((n_logical < nthreads) && (ntry++ < max_tries)) {
+            int idim = (ntry % 2) + 1;  // alternate between 1 and 2
+            tile_size[idim] /= 2;
+            AMREX_ALWAYS_ASSERT_WITH_MESSAGE(tile_size[idim] > 0,
+                "Failed to set a proper tile size for the number of OpenMP threads. "
+                "Consider lowering the number of OpenMP threads via the environment variable OMP_NUM_THREADS."
+            );
+            n_logical = numTilesInBox(ba[gid], true, tile_size);
+        }
+
+        if (n_logical < nthreads) {
+            amrex::Abort("ImpactXParticleContainer::prepare() "
+                         "could not find a good tile size for the number of OpenMP threads. "
+                         "Consider lowering the number of OpenMP threads via the environment variable OMP_NUM_THREADS."
+            );
+        }
+
+        reserveData();
+        resizeData();
+    }
+
     void
     ImpactXParticleContainer::AddNParticles (
         amrex::Gpu::DeviceVector<amrex::ParticleReal> const & x,
@@ -130,8 +186,8 @@ namespace impactx
         // number of particles to add
         int const np = x.size();
 
-        // we add particles to lev 0, tile 0 of the first box assigned to this proc
-        int lid = 0, gid = 0, tid = 0;
+        // we add particles to lev 0, grid 0
+        int lid = 0, gid = 0;
         {
             const auto& pmap = ParticleDistributionMap(lid).ProcessorMap();
             auto it = std::find(pmap.begin(), pmap.end(), amrex::ParallelDescriptor::MyProc());
@@ -141,61 +197,96 @@ namespace impactx
                 gid = *it;
             }
         }
-        auto& particle_tile = DefineAndReturnParticleTile(lid, gid, tid);
-        auto old_np = particle_tile.numParticles();
-        auto new_np = old_np + np;
-        particle_tile.resize(new_np);
-
-        // Update NextID to include particles created in this function
-        int pid;
-#ifdef AMREX_USE_OMP
-#pragma omp critical (add_beam_nextid)
+        int nthreads = 1;
+#if defined(AMREX_USE_OMP)
+        nthreads = omp_get_max_threads();
 #endif
-        {
-            pid = ParticleType::NextID();
-            ParticleType::NextID(pid+np);
+
+        // split up particles over nthreads tiles
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
+            numTilesInBox(ParticleBoxArray(lid)[gid], true, tile_size) >= nthreads,
+            "Not enough tiles for the number of OpenMP threads - please try reducing "
+            "particles.tile_size or increasing the number of cells in the domain."
+        );
+        for (int ithr = 0; ithr < nthreads; ++ithr) {
+            DefineAndReturnParticleTile(lid, gid, ithr);
         }
+
+        int pid = ParticleType::NextID();
+        ParticleType::NextID(pid + np);
         AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
-            static_cast<amrex::Long>(pid) + static_cast<amrex::Long>(np) < amrex::LongParticleIds::LastParticleID,
-            "ERROR: overflow on particle id numbers");
-
-        const int cpuid = amrex::ParallelDescriptor::MyProc();
-
-        auto & soa = particle_tile.GetStructOfArrays().GetRealData();
-        amrex::ParticleReal * const AMREX_RESTRICT x_arr = soa[RealSoA::x].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT y_arr = soa[RealSoA::y].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT t_arr = soa[RealSoA::t].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT px_arr = soa[RealSoA::px].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT py_arr = soa[RealSoA::py].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT pt_arr = soa[RealSoA::pt].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT qm_arr = soa[RealSoA::qm].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT w_arr = soa[RealSoA::w ].dataPtr();
-
-        uint64_t * const AMREX_RESTRICT idcpu_arr = particle_tile.GetStructOfArrays().GetIdCPUData().dataPtr();
-
-        amrex::ParticleReal const * const AMREX_RESTRICT x_ptr = x.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT y_ptr = y.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT t_ptr = t.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT px_ptr = px.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT py_ptr = py.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT pt_ptr = pt.data();
-
-        amrex::ParallelFor(np,
-        [=] AMREX_GPU_DEVICE (int i) noexcept
+            static_cast<amrex::Long>(pid) + static_cast<amrex::Long>(np) < amrex::LongParticleIds::LastParticleID,
+            "ERROR: overflow on particle id numbers"
+        );
+
+#if defined(AMREX_USE_OMP)
+#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
+#endif
         {
-            idcpu_arr[old_np+i] = amrex::SetParticleIDandCPU(pid + i, cpuid);
-
-            x_arr[old_np+i] = x_ptr[i];
-            y_arr[old_np+i] = y_ptr[i];
-            t_arr[old_np+i] = t_ptr[i];
-
-            px_arr[old_np+i] = px_ptr[i];
-            py_arr[old_np+i] = py_ptr[i];
-            pt_arr[old_np+i] = pt_ptr[i];
-            qm_arr[old_np+i] = qm;
-            w_arr[old_np+i] = bchchg/ablastr::constant::SI::q_e/np;
-        });
+            int tid = 0;  // tile index = OpenMP thread id (0 without OpenMP)
+#if defined(AMREX_USE_OMP)
+            tid = omp_get_thread_num();
+#endif
+
+            // We split the np particles over multiple tiles. If nthreads
+            // evenly divides np, each thread gets n_regular particles. If
+            // there are leftovers, the first n_leftover threads get one extra.
+            int n_regular = np / nthreads;
+            int n_leftover = np - n_regular*nthreads;
+
+            int num_to_add = 0;  /* how many particles this tile will get */
+            int my_offset = 0;   /* offset into global arrays x, y, etc. for this thread */
+
+            if (tid < n_leftover) {  // get n_regular+1 items
+                my_offset = tid * (n_regular + 1);
+                num_to_add = n_regular + 1;
+            } else {  // get n_regular items
+                my_offset = tid * n_regular + n_leftover;
+                num_to_add = n_regular;
+            }
+
+            auto& particle_tile = ParticlesAt(lid, gid, tid);
+            auto old_np = particle_tile.numParticles();
+            auto new_np = old_np + num_to_add;
+            particle_tile.resize(new_np);
+
+            const int cpuid = amrex::ParallelDescriptor::MyProc();
+
+            auto & soa = particle_tile.GetStructOfArrays().GetRealData();
+            amrex::ParticleReal * const AMREX_RESTRICT x_arr = soa[RealSoA::x].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT y_arr = soa[RealSoA::y].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT t_arr = soa[RealSoA::t].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT px_arr = soa[RealSoA::px].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT py_arr = soa[RealSoA::py].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT pt_arr = soa[RealSoA::pt].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT qm_arr = soa[RealSoA::qm].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT w_arr = soa[RealSoA::w ].dataPtr();
+
+            uint64_t * const AMREX_RESTRICT idcpu_arr = particle_tile.GetStructOfArrays().GetIdCPUData().dataPtr();
+
+            amrex::ParticleReal const * const AMREX_RESTRICT x_ptr = x.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT y_ptr = y.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT t_ptr = t.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT px_ptr = px.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT py_ptr = py.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT pt_ptr = pt.data();
+
+            amrex::ParallelFor(num_to_add,
+            [=] AMREX_GPU_DEVICE (int i) noexcept
+            {
+                idcpu_arr[old_np+i] = amrex::SetParticleIDandCPU(pid + my_offset + i, cpuid);
+
+                x_arr[old_np+i] = x_ptr[my_offset+i];
+                y_arr[old_np+i] = y_ptr[my_offset+i];
+                t_arr[old_np+i] = t_ptr[my_offset+i];
+
+                px_arr[old_np+i] = px_ptr[my_offset+i];
+                py_arr[old_np+i] = py_ptr[my_offset+i];
+                pt_arr[old_np+i] = pt_ptr[my_offset+i];
+
+                qm_arr[old_np+i] = qm;
+                w_arr[old_np+i] = bchchg/ablastr::constant::SI::q_e/np;
+            });
+        }
 
         // safety first: in case passed attribute arrays were temporary, we
         // want to make sure the ParallelFor has ended here
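Aside on the split arithmetic in AddNParticles: the (my_offset, num_to_add) pairs cover the np input particles exactly once, back-to-back. A minimal standalone C++ sketch of the same logic, useful for checking the invariant in isolation (the helper name `slice` is illustrative and not part of the patch):

    #include <cassert>
    #include <cstdio>

    // Sketch of the split used in AddNParticles: thread tid gets num_to_add
    // consecutive particles starting at my_offset; the first n_leftover
    // threads take one extra when nthreads does not divide np evenly.
    void slice (int np, int nthreads, int tid, int& my_offset, int& num_to_add)
    {
        int const n_regular = np / nthreads;
        int const n_leftover = np - n_regular * nthreads;
        if (tid < n_leftover) {
            my_offset = tid * (n_regular + 1);
            num_to_add = n_regular + 1;
        } else {
            my_offset = tid * n_regular + n_leftover;
            num_to_add = n_regular;
        }
    }

    int main ()
    {
        // np = 10 particles over nthreads = 4 tiles gives slices
        // (offset, count) = (0,3), (3,3), (6,2), (8,2): contiguous, complete
        int total = 0;
        for (int tid = 0; tid < 4; ++tid) {
            int offset = 0, count = 0;
            slice(10, 4, tid, offset, count);
            assert(offset == total);  // slices line up back-to-back
            total += count;
            std::printf("tid %d: offset %d, count %d\n", tid, offset, count);
        }
        assert(total == 10);  // every particle assigned exactly once
        return 0;
    }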