diff --git a/examples/fodo_programmable/run_fodo_programmable.py b/examples/fodo_programmable/run_fodo_programmable.py
index 8899c7ea3..33ecc8f03 100755
--- a/examples/fodo_programmable/run_fodo_programmable.py
+++ b/examples/fodo_programmable/run_fodo_programmable.py
@@ -117,7 +117,6 @@ def my_ref_drift(pge, refpart):
 pge1.beam_particles = lambda pti, refpart: my_drift(pge1, pti, refpart)
 pge1.ref_particle = lambda refpart: my_ref_drift(pge1, refpart)
 pge1.ds = 0.25
-pge1.threadsafe = True  # allow OpenMP threading for speed
 
 # attention: assignment is a reference for pge2 = pge1
@@ -126,7 +125,6 @@ def my_ref_drift(pge, refpart):
 pge2.beam_particles = lambda pti, refpart: my_drift(pge2, pti, refpart)
 pge2.ref_particle = lambda refpart: my_ref_drift(pge2, refpart)
 pge2.ds = 0.5
-pge2.threadsafe = True  # allow OpenMP threading for speed
 
 # add beam diagnostics
 monitor = elements.BeamMonitor("monitor", backend="h5")
diff --git a/src/ImpactX.cpp b/src/ImpactX.cpp
index 3a2796c52..eca17e2c3 100644
--- a/src/ImpactX.cpp
+++ b/src/ImpactX.cpp
@@ -94,13 +94,11 @@ namespace impactx
         // init blocks / grids & MultiFabs
         amr_data->InitFromScratch(0.0);
 
-        // alloc particle containers
-        // have to resize here, not in the constructor because grids have not
+        // prepare particle containers
+        // have to do this here, not in the constructor because grids have not
         //   been built when constructor was called.
-        amr_data->track_particles.m_particle_container->reserveData();
-        amr_data->track_particles.m_particle_container->resizeData();
-        amr_data->track_particles.m_particles_lost->reserveData();
-        amr_data->track_particles.m_particles_lost->resizeData();
+        amr_data->track_particles.m_particle_container->prepare();
+        amr_data->track_particles.m_particles_lost->prepare();
 
         // register shortcut
         amr_data->track_particles.m_particle_container->SetLostParticleContainer(amr_data->track_particles.m_particles_lost.get());
diff --git a/src/particles/ImpactXParticleContainer.H b/src/particles/ImpactXParticleContainer.H
index 27fdebc10..00f518370 100644
--- a/src/particles/ImpactXParticleContainer.H
+++ b/src/particles/ImpactXParticleContainer.H
@@ -143,6 +143,13 @@ namespace impactx
         //! Destruct a particle container
         virtual ~ImpactXParticleContainer() = default;
 
+        /** Prepares the container for use.
+         *
+         * This sets the tile size to a sensible default and resizes the
+         * underlying data structures.
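+         *
+         * Without space charge, particle work is OpenMP-parallelized over
+         * tiles, so this also shrinks the tile size until level 0, grid 0
+         * has at least one tile per OpenMP thread. For example, a single
+         * box of 16x16 cells in (y, z) run with 8 threads starts at the
+         * default 8x8 tile size, i.e. 2x2 = 4 tiles; halving the z tile
+         * size once (8 -> 4) gives 2x4 = 8 tiles, one per thread.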
+         */
+        void prepare ();
+
         /** Add new particles to the container for fixed s.
          *
          * Note: This can only be used *after* the initialization (grids) have
diff --git a/src/particles/ImpactXParticleContainer.cpp b/src/particles/ImpactXParticleContainer.cpp
index 7b9dc3c1a..306e8098d 100644
--- a/src/particles/ImpactXParticleContainer.cpp
+++ b/src/particles/ImpactXParticleContainer.cpp
@@ -13,6 +13,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -107,6 +108,61 @@ namespace impactx
         SetParticleShape(v);
     }
 
+    void
+    ImpactXParticleContainer::prepare ()
+    {
+        // make sure we have at least one box with enough tiles for each OpenMP thread
+
+        // make sure level 0, grid 0 exists
+        int lid = 0, gid = 0;
+        {
+            const auto& pmap = ParticleDistributionMap(lid).ProcessorMap();
+            auto it = std::find(pmap.begin(), pmap.end(), amrex::ParallelDescriptor::MyProc());
+            if (it == std::end(pmap)) {
+                amrex::Abort("Particle container needs to have at least one grid.");
+            } else {
+                gid = *it;
+            }
+        }
+
+        int nthreads = 1;
+#if defined(AMREX_USE_OMP)
+        nthreads = omp_get_max_threads();
+#endif
+
+        // When running without space charge but with OpenMP parallelization,
+        // we need to make sure level 0, grid 0 has enough tiles to thread
+        // over. We try to set the tile_size appropriately here. The tiles
+        // start as (very large number, 8, 8) in (x, y, z), so we reduce the
+        // tile size in the y and z directions alternately until there are
+        // enough tiles for the number of threads.
+
+        const auto& ba = ParticleBoxArray(lid);
+        auto n_logical = numTilesInBox(ba[gid], true, tile_size);
+
+        int ntry = 0;
+        constexpr int max_tries = 6;
+        while ((n_logical < nthreads) && (ntry++ < max_tries)) {
+            int idim = (ntry % 2) + 1;  // alternate between 1 and 2
+            tile_size[idim] /= 2;
+            AMREX_ALWAYS_ASSERT_WITH_MESSAGE(tile_size[idim] > 0,
+                "Failed to set a proper tile size for the number of OpenMP threads. "
+                "Consider lowering the number of OpenMP threads via the environment variable OMP_NUM_THREADS."
+            );
+            n_logical = numTilesInBox(ba[gid], true, tile_size);
+        }
+
+        if (n_logical < nthreads) {
+            amrex::Abort("ImpactXParticleContainer::prepare() "
+                         "could not find a good tile size for the number of OpenMP threads. "
+                         "Consider lowering the number of OpenMP threads via the environment variable OMP_NUM_THREADS."
+            );
+        }
+
+        reserveData();
+        resizeData();
+    }
+
     void
     ImpactXParticleContainer::AddNParticles (
         amrex::Gpu::DeviceVector<amrex::ParticleReal> const & x,
@@ -130,8 +186,8 @@ namespace impactx
         // number of particles to add
         int const np = x.size();
 
-        // we add particles to lev 0, tile 0 of the first box assigned to this proc
-        int lid = 0, gid = 0, tid = 0;
+        // we add particles to lev 0, grid 0
+        int lid = 0, gid = 0;
         {
             const auto& pmap = ParticleDistributionMap(lid).ProcessorMap();
             auto it = std::find(pmap.begin(), pmap.end(), amrex::ParallelDescriptor::MyProc());
@@ -141,61 +197,96 @@ namespace impactx
                 gid = *it;
             }
         }
-        auto& particle_tile = DefineAndReturnParticleTile(lid, gid, tid);
-        auto old_np = particle_tile.numParticles();
-        auto new_np = old_np + np;
-        particle_tile.resize(new_np);
-
-        // Update NextID to include particles created in this function
-        int pid;
-#ifdef AMREX_USE_OMP
-#pragma omp critical (add_beam_nextid)
+        int nthreads = 1;
+#if defined(AMREX_USE_OMP)
+        nthreads = omp_get_max_threads();
 #endif
-        {
-            pid = ParticleType::NextID();
-            ParticleType::NextID(pid+np);
+
+        // split up particles over nthreads tiles
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
+            numTilesInBox(ParticleBoxArray(lid)[gid], true, tile_size) >= nthreads,
+            "Not enough tiles for the number of OpenMP threads - please try reducing "
+            "particles.tile_size or increasing the number of cells in the domain."
+        );
+        for (int ithr = 0; ithr < nthreads; ++ithr) {
+            DefineAndReturnParticleTile(lid, gid, ithr);
         }
+
+        int pid = ParticleType::NextID();
+        ParticleType::NextID(pid + np);
         AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
-            static_cast<amrex::Long>(pid) + static_cast<amrex::Long>(np) < amrex::LongParticleIds::LastParticleID,
-            "ERROR: overflow on particle id numbers");
-
-        const int cpuid = amrex::ParallelDescriptor::MyProc();
-
-        auto & soa = particle_tile.GetStructOfArrays().GetRealData();
-        amrex::ParticleReal * const AMREX_RESTRICT x_arr = soa[RealSoA::x].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT y_arr = soa[RealSoA::y].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT t_arr = soa[RealSoA::t].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT px_arr = soa[RealSoA::px].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT py_arr = soa[RealSoA::py].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT pt_arr = soa[RealSoA::pt].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT qm_arr = soa[RealSoA::qm].dataPtr();
-        amrex::ParticleReal * const AMREX_RESTRICT w_arr = soa[RealSoA::w ].dataPtr();
-
-        uint64_t * const AMREX_RESTRICT idcpu_arr = particle_tile.GetStructOfArrays().GetIdCPUData().dataPtr();
-
-        amrex::ParticleReal const * const AMREX_RESTRICT x_ptr = x.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT y_ptr = y.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT t_ptr = t.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT px_ptr = px.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT py_ptr = py.data();
-        amrex::ParticleReal const * const AMREX_RESTRICT pt_ptr = pt.data();
-
-        amrex::ParallelFor(np,
-        [=] AMREX_GPU_DEVICE (int i) noexcept
+            static_cast<amrex::Long>(pid) + static_cast<amrex::Long>(np) < amrex::LongParticleIds::LastParticleID,
+            "ERROR: overflow on particle id numbers"
+        );
+
+#if defined(AMREX_USE_OMP)
+#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
+#endif
         {
-            idcpu_arr[old_np+i] = amrex::SetParticleIDandCPU(pid + i, cpuid);
-
-            x_arr[old_np+i] = x_ptr[i];
-            y_arr[old_np+i] = y_ptr[i];
-            t_arr[old_np+i] = t_ptr[i];
-
-            px_arr[old_np+i] = px_ptr[i];
-            py_arr[old_np+i] = py_ptr[i];
-            pt_arr[old_np+i] = pt_ptr[i];
-            qm_arr[old_np+i] = qm;
-            w_arr[old_np+i] = bchchg/ablastr::constant::SI::q_e/np;
-        });
+            int tid = 0;  // tile index = OpenMP thread id (0 without OpenMP)
+#if defined(AMREX_USE_OMP)
+            tid = omp_get_thread_num();
+#endif
+
+            // We split the np particles over multiple tiles. If nthreads
+            // evenly divides np, each thread gets n_regular particles. If
+            // there are leftovers, the first n_leftover threads get one extra.
+            int n_regular = np / nthreads;
+            int n_leftover = np - n_regular*nthreads;
+
+            int num_to_add = 0;  /* how many particles this tile will get */
+            int my_offset = 0;   /* offset into global arrays x, y, etc. for this thread */
+
+            if (tid < n_leftover) {  // get n_regular+1 items
+                my_offset = tid * (n_regular + 1);
+                num_to_add = n_regular + 1;
+            } else {  // get n_regular items
+                my_offset = tid * n_regular + n_leftover;
+                num_to_add = n_regular;
+            }
+
+            auto& particle_tile = ParticlesAt(lid, gid, tid);
+            auto old_np = particle_tile.numParticles();
+            auto new_np = old_np + num_to_add;
+            particle_tile.resize(new_np);
+
+            const int cpuid = amrex::ParallelDescriptor::MyProc();
+
+            auto & soa = particle_tile.GetStructOfArrays().GetRealData();
+            amrex::ParticleReal * const AMREX_RESTRICT x_arr = soa[RealSoA::x].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT y_arr = soa[RealSoA::y].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT t_arr = soa[RealSoA::t].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT px_arr = soa[RealSoA::px].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT py_arr = soa[RealSoA::py].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT pt_arr = soa[RealSoA::pt].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT qm_arr = soa[RealSoA::qm].dataPtr();
+            amrex::ParticleReal * const AMREX_RESTRICT w_arr = soa[RealSoA::w ].dataPtr();
+
+            uint64_t * const AMREX_RESTRICT idcpu_arr = particle_tile.GetStructOfArrays().GetIdCPUData().dataPtr();
+
+            amrex::ParticleReal const * const AMREX_RESTRICT x_ptr = x.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT y_ptr = y.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT t_ptr = t.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT px_ptr = px.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT py_ptr = py.data();
+            amrex::ParticleReal const * const AMREX_RESTRICT pt_ptr = pt.data();
+
+            amrex::ParallelFor(num_to_add,
+            [=] AMREX_GPU_DEVICE (int i) noexcept
+            {
+                idcpu_arr[old_np+i] = amrex::SetParticleIDandCPU(pid + my_offset + i, cpuid);
+
+                x_arr[old_np+i] = x_ptr[my_offset+i];
+                y_arr[old_np+i] = y_ptr[my_offset+i];
+                t_arr[old_np+i] = t_ptr[my_offset+i];
+
+                px_arr[old_np+i] = px_ptr[my_offset+i];
+                py_arr[old_np+i] = py_ptr[my_offset+i];
+                pt_arr[old_np+i] = pt_ptr[my_offset+i];
+
+                qm_arr[old_np+i] = qm;
+                w_arr[old_np+i] = bchchg/ablastr::constant::SI::q_e/np;
+            });
+        }
 
         // safety first: in case passed attribute arrays were temporary, we
         // want to make sure the ParallelFor has ended here
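Aside on the split arithmetic in AddNParticles: the (my_offset, num_to_add) pairs cover the np input particles exactly once, back-to-back. A minimal standalone C++ sketch of the same logic, useful for checking the invariant in isolation (the helper name `slice` is illustrative and not part of the patch):

    #include <cassert>
    #include <cstdio>

    // Sketch of the split used in AddNParticles: thread tid gets num_to_add
    // consecutive particles starting at my_offset; the first n_leftover
    // threads take one extra when nthreads does not divide np evenly.
    void slice (int np, int nthreads, int tid, int& my_offset, int& num_to_add)
    {
        int const n_regular = np / nthreads;
        int const n_leftover = np - n_regular * nthreads;
        if (tid < n_leftover) {
            my_offset = tid * (n_regular + 1);
            num_to_add = n_regular + 1;
        } else {
            my_offset = tid * n_regular + n_leftover;
            num_to_add = n_regular;
        }
    }

    int main ()
    {
        // np = 10 particles over nthreads = 4 tiles gives slices
        // (offset, count) = (0,3), (3,3), (6,2), (8,2): contiguous, complete
        int total = 0;
        for (int tid = 0; tid < 4; ++tid) {
            int offset = 0, count = 0;
            slice(10, 4, tid, offset, count);
            assert(offset == total);  // slices line up back-to-back
            total += count;
            std::printf("tid %d: offset %d, count %d\n", tid, offset, count);
        }
        assert(total == 10);  // every particle assigned exactly once
        return 0;
    }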