From dd7e9471a533b71e1233578d227da337fbe85c52 Mon Sep 17 00:00:00 2001
From: Xuchen Han
Date: Thu, 9 Jan 2025 15:17:50 -0800
Subject: [PATCH 1/2] [mpm] Add ParticleSorter

In addition, SpGrid reports the flags it uses, and adds the missing
set_zero() method for GridData.
---
 multibody/mpm/BUILD.bazel                  |  26 ++
 multibody/mpm/grid_data.h                  |  21 +-
 multibody/mpm/particle_sorter.cc           |  27 ++
 multibody/mpm/particle_sorter.h            | 303 +++++++++++++++++++++
 multibody/mpm/spgrid.h                     |  38 ++-
 multibody/mpm/test/grid_data_test.cc       |   6 +-
 multibody/mpm/test/particle_sorter_test.cc | 151 ++++++++++
 multibody/mpm/test/spgrid_test.cc          |   6 +-
 8 files changed, 555 insertions(+), 23 deletions(-)
 create mode 100644 multibody/mpm/particle_sorter.cc
 create mode 100644 multibody/mpm/particle_sorter.h
 create mode 100644 multibody/mpm/test/particle_sorter_test.cc

diff --git a/multibody/mpm/BUILD.bazel b/multibody/mpm/BUILD.bazel
index d233a4d75c44..4ceebdd93b8c 100644
--- a/multibody/mpm/BUILD.bazel
+++ b/multibody/mpm/BUILD.bazel
@@ -17,6 +17,7 @@ drake_cc_package_library(
         ":bspline_weights",
         ":grid_data",
         ":particle_data",
+        ":particle_sorter",
     ],
 )

@@ -60,6 +61,21 @@ drake_cc_library(
     ],
 )

+drake_cc_library(
+    name = "particle_sorter",
+    srcs = [
+        "particle_sorter.cc",
+    ],
+    hdrs = [
+        "particle_sorter.h",
+    ],
+    deps = [
+        ":bspline_weights",
+        "//common:essential",
+        "//math:gradient",
+    ],
+)
+
 # TODO(xuchenhan-tri): when we enable SPGrid in our releases, we also need to
 # install its license file in drake/tools/workspace/BUILD.bazel.

@@ -99,6 +115,16 @@ drake_cc_googletest(
     ],
 )

+drake_cc_googletest_linux_only(
+    name = "particle_sorter_test",
+    deps = [
+        ":bspline_weights",
+        ":grid_data",
+        ":particle_sorter",
+        ":spgrid",
+    ],
+)
+
 drake_cc_googletest_linux_only(
     name = "spgrid_test",
     deps = [
diff --git a/multibody/mpm/grid_data.h b/multibody/mpm/grid_data.h
index dcfeba839999..f11053c59752 100644
--- a/multibody/mpm/grid_data.h
+++ b/multibody/mpm/grid_data.h
@@ -108,8 +108,7 @@ struct GridData {
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                 "T must be float or double.");

-  /* Resets `this` GridData to its default state where all floating point
-   values are set to NAN and the index is inactive. */
+  /* Resets all floating point data to zero and the index to be inactive. */
   void reset() { *this = {}; }

   /* Returns true iff `this` GridData is bit-wise equal to `other`. */
@@ -117,24 +116,12 @@ struct GridData {
     return std::memcmp(this, &other, sizeof(GridData<T>)) == 0;
   }

-  Vector3<T> v{Vector3<T>::Constant(nan_with_all_bits_set())};
-  T m{nan_with_all_bits_set()};
-  Vector3<T> scratch{Vector3<T>::Constant(nan_with_all_bits_set())};
+  Vector3<T> v{Vector3<T>::Zero()};
+  T m{0};
+  Vector3<T> scratch{Vector3<T>::Zero()};
   std::conditional_t<std::is_same_v<T, float>, IndexOrFlag<int32_t>,
                      IndexOrFlag<int64_t>>
       index_or_flag{};
-
- private:
-  /* Returns a floating point NaN value with all bits set to one. This choice
-   makes the reset() function more efficient. In particular, it allows the
-   generated machine code to memset all bits to 1 instead of handling each
-   field individually. */
-  static T nan_with_all_bits_set() {
-    using IntType =
-        std::conditional_t<std::is_same_v<T, float>, int32_t, int64_t>;
-    constexpr IntType kAllBitsOn = -1;
-    return drake::internal::bit_cast<T>(kAllBitsOn);
-  }
 };

 /* With T = float, GridData is expected to be 32 bytes. With T = double,
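The hunk above switches GridData's defaults from NaN-with-all-bits-set to
zero, so `reset()` (which value-initializes via `*this = {};`) now clears a
node to all-zero bits. A minimal standalone sketch of the same pattern — with
a hypothetical `MiniGridData` stand-in, not code from this patch — shows why
the memset-style `reset()` and the bit-wise `operator==` go hand in hand: both
assume the type is trivially copyable with a predictable byte layout.

```
#include <cstring>
#include <type_traits>

struct MiniGridData {
  double v[3]{};  // Zero-initialized, mirroring Vector3<T>::Zero().
  double m{0};

  // Value-initialization zeroes every member, so the compiler is free to
  // lower this assignment to a single memset over the whole struct.
  void reset() { *this = {}; }

  // Bit-wise comparison is only meaningful because the type is trivially
  // copyable and (here) has no padding bytes.
  bool operator==(const MiniGridData& other) const {
    return std::memcmp(this, &other, sizeof(MiniGridData)) == 0;
  }
};
static_assert(std::is_trivially_copyable_v<MiniGridData>);

int main() {
  MiniGridData a, b;
  a.m = 1.0;
  a.reset();
  return (a == b) ? 0 : 1;  // reset() restores the all-zero default state.
}
```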
diff --git a/multibody/mpm/particle_sorter.cc b/multibody/mpm/particle_sorter.cc
new file mode 100644
index 000000000000..725e31ca779e
--- /dev/null
+++ b/multibody/mpm/particle_sorter.cc
@@ -0,0 +1,27 @@
+#include "drake/multibody/mpm/particle_sorter.h"
+
+namespace drake {
+namespace multibody {
+namespace mpm {
+namespace internal {
+
+void ConvertToRangeVector(const std::vector<int>& data, RangeVector* ranges) {
+  ranges->clear();
+  ranges->reserve(data.size() - 1);
+  for (int i = 1; i < ssize(data); ++i) {
+    ranges->push_back({data[i - 1], data[i]});
+  }
+}
+
+std::vector<uint64_t> ParticleSorter::GetActiveBlockOffsets() const {
+  std::vector<uint64_t> offsets(ssize(sentinel_particles_) - 1);
+  for (int i = 0; i < ssize(sentinel_particles_) - 1; ++i) {
+    offsets[i] = base_node_offsets_[sentinel_particles_[i]];
+  }
+  return offsets;
+}
+
+}  // namespace internal
+}  // namespace mpm
+}  // namespace multibody
+}  // namespace drake
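A hypothetical walk-through (illustrative only; the names mirror the private
members used above) of how the sentinel bookkeeping feeds both functions:
given 12 sorted particles whose block boundaries fall at particles 0, 5, and
9, `ConvertToRangeVector` produces the per-block particle ranges, and
`GetActiveBlockOffsets` picks one representative offset per block.

```
#include <cassert>
#include <cstdint>
#include <vector>

struct Range { int start; int end; };

int main() {
  // Block boundaries found by a sort of 12 particles: new blocks begin at
  // particles 0, 5, and 9; the final sentinel is always num_particles.
  const std::vector<int> sentinels = {0, 5, 9, 12};
  const std::vector<uint64_t> base_node_offsets(12, 0);  // Placeholder data.

  // ConvertToRangeVector: consecutive sentinel pairs become ranges.
  std::vector<Range> ranges;
  for (size_t i = 1; i < sentinels.size(); ++i) {
    ranges.push_back({sentinels[i - 1], sentinels[i]});
  }
  assert(ranges.size() == 3);  // {0,5}, {5,9}, {9,12}.

  // GetActiveBlockOffsets: one representative offset per active block,
  // taken at each block's first (sentinel) particle.
  std::vector<uint64_t> active;
  for (size_t i = 0; i + 1 < sentinels.size(); ++i) {
    active.push_back(base_node_offsets[sentinels[i]]);
  }
  assert(active.size() == 3);  // Exactly one offset per active block.
  return 0;
}
```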
diff --git a/multibody/mpm/particle_sorter.h b/multibody/mpm/particle_sorter.h
new file mode 100644
index 000000000000..5be902337468
--- /dev/null
+++ b/multibody/mpm/particle_sorter.h
@@ -0,0 +1,303 @@
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <vector>
+
+#include "drake/common/drake_assert.h"
+#include "drake/common/eigen_types.h"
+#include "drake/common/ssize.h"
+#include "drake/math/autodiff_gradient.h"
+#include "drake/multibody/mpm/bspline_weights.h"
+
+namespace drake {
+namespace multibody {
+namespace mpm {
+namespace internal {
+
+/* A range of integers [start, end). */
+struct Range {
+  int start;
+  int end;
+};
+
+using RangeVector = std::vector<Range>;
+
+/* Given a sorted vector of integers key points, converts it to a vector of
+ consecutive ranges. For example, given the input {1, 3, 6, 9}, the output
+ should be {{1, 3}, {3, 6}, {6, 9}}.
+ @pre data is sorted in ascending order (no repeats) and has at least two
+ elements.
+ @pre ranges != nullptr */
+void ConvertToRangeVector(const std::vector<int>& data, RangeVector* ranges);
+
+/* ParticleSorter sorts MPM particle data based on their positions within the
+ grid.
+
+ Given a set of MPM particles and the background grid, recall that we define
+ the base node of a particle to be the grid node that is closest to the
+ particle (see multibody::mpm::internal::ComputeBaseNode).
+
+ ParticleSorter provides a `Sort()` function that sorts the particles first
+ based on their base node offsets (recall that the offset of a node is the 1D
+ index of the node; see multibody::mpm::internal::SpGrid for the full
+ definition) in the grid; if they are the same, it then sorts the particles
+ by their original indices. This sorting is useful for efficient data access
+ in the MPM transfer kernels.
+
+ After `Sort()` is called, the ParticleSorter provides information about the
+ particles and their background grid based on the result of the sort. */
+class ParticleSorter {
+ public:
+  DRAKE_DEFAULT_COPY_AND_MOVE_AND_ASSIGN(ParticleSorter);
+  ParticleSorter() = default;
+
+  // TODO(xuchenhan-tri): The implementation of this function can be moved to
+  // the .cc file when SpGrid is no longer an open template.
+  /* Sorts the particles based on the particle positions so that the ordering
+   is more memory friendly with respect to the given SpGrid.
+   @param[in] spgrid              The SPGrid data structure of the MPM grid.
+   @param[in] dx                  The MPM grid spacing [meters].
+   @param[in] particle_positions  All particle positions, measured and
+                                  expressed in the world frame. */
+  template <typename T, typename SpGrid>
+  void Sort(const SpGrid& spgrid, double dx,
+            const std::vector<Vector3<T>>& particle_positions) {
+    const auto& flags = spgrid.flags();
+    const int log2_max_grid_size = flags.log2_max_grid_size;
+    const int data_bits = flags.data_bits;
+    const int log2_page = flags.log2_page;
+
+    const int num_particles = particle_positions.size();
+    data_indices_.resize(num_particles);
+    base_node_offsets_.resize(num_particles);
+    particle_sorters_.resize(num_particles);
+
+    // TODO(xuchenhan-tri): Switch to using radix sort to exploit the fact that
+    // our keys are 64 bit unsigned integers. It's proven to be much faster than
+    // std::sort in practice.
+
+    /* We sort particles first based on their base node offsets in the spgrid;
+     if they are the same, then we sort the particles by their original indices.
+     (In other words, we sort the particles using lexicographical order of the
+     base node offsets and the original indices pair. However, we choose the
+     following strategy to allow for a non-comparative sort in the future.) To
+     achieve this, we pack the base node offsets and the original indices of the
+     particles into a single 64 bit unsigned integer as the key for sorting, so
+     that the bit representation of the key is:
+
+        bits_for_base_node_offset | bits_for_particle_index.
+
+     However, that's easier said than done, because the base node offset is
+     already taking 64 bits! Fortunately, we notice that all base node offsets
+     in the spgrid of the particle have the following structure
+
+        page_bits | block_bits | data_bits
+
+     with all the data bits being zero (guaranteed by SPGrid). In addition, a
+     few bits (starting from the left) in the page bits are also zero (we
+     explain why below). Therefore, we can truncate the zero bits in the
+     base_node_offset and squeeze in the original particle index into the lower
+     bits of a single 64 bit unsigned integer.
+
+     Q: Why are the left most bits in page_bits zero?
+
+     A: Because we cap the total number of allowed grid nodes: there are at most
+     2^(3*log2_max_grid_size) grid nodes in the grid, which requires
+     3*log2_max_grid_size bits to represent. SPGrid uses page_bits and
+     block_bits to represent the grid nodes (and data_bits are used, well, to
+     store data). The page_bits and block_bits have (64 - data_bits) in total,
+     and only 3*log2_max_grid_size bits are used, so the left most
+     (64 - data_bits - 3*log2_max_grid_size) bits are zero.
+
+     Q: So how many zeros can we truncate in the bits for base_node_offsets to
+     make room for the particle indices? And is it enough room?
+
+     A: There are (64 - data_bits - 3*log2_max_grid_size) zeros on the left and
+     data_bits zeros on the right, so they add up to (64 - 3*log2_max_grid_size)
+     zeros in total. With a typical log2_max_grid_size == 10, we have
+     (64 - 3*10) = 34 bits to work with, which is more than enough to store the
+     32 bit particle indices.
+
+     Therefore, the overall strategy is to left shift the base node offset by
+     (64 - data_bits - 3*log2_max_grid_size), which leaves the lower
+     (64 - 3*log2_max_grid_size) bits to be zero. We then use the right most 32
+     bits to store the original index of the particles. */
+    const int index_bits = 64 - 3 * log2_max_grid_size;
+    // TODO(xuchenhan-tri): The following check (i.e. that we don't have too
+    // many particles) should be enforced when we register the particles to the
+    // MPM model.
+    /* Make sure that we have enough bits to store the index of the particles.
+    */
+    DRAKE_DEMAND(uint64_t(1) << index_bits >
+                 static_cast<uint64_t>(num_particles));
+    const int zero_page_bits = 64 - data_bits - 3 * log2_max_grid_size;
+
+    for (int p = 0; p < num_particles; ++p) {
+      /* We allow T == AutoDiffXd only for testing purposes. The positions in
+       those tests are guaranteed to have zero gradients. Here we cast
+       particle_x to double regardless of what type T is to simplify the code
+       path for computing base node immediately below. */
+      const auto& particle_x = [&]() -> const Vector3<double>& {
+        if constexpr (std::is_same_v<T, double>) {
+          return particle_positions[p];
+        } else if constexpr (std::is_same_v<T, float>) {
+          return particle_positions[p].template cast<double>();
+        } else {
+          return math::DiscardZeroGradient(particle_positions[p]);
+        }
+      }();
+      const Vector3<int> base_node = ComputeBaseNode(particle_x / dx);
+      base_node_offsets_[p] =
+          spgrid.CoordinateToOffset(base_node[0], base_node[1], base_node[2]);
+      data_indices_[p] = p;
+      /* Confirm the data bits of the base node offset are all zero. */
+      DRAKE_ASSERT((base_node_offsets_[p] & ((uint64_t(1) << data_bits) - 1)) ==
+                   0);
+      /* Confirm the left most bits in the page bits are unused. */
+      DRAKE_ASSERT((base_node_offsets_[p] &
+                    ~((uint64_t(1) << (64 - zero_page_bits)) - 1)) == 0);
+      particle_sorters_[p] =
+          (base_node_offsets_[p] << zero_page_bits) + data_indices_[p];
+    }
+    std::sort(particle_sorters_.begin(), particle_sorters_.end());
+
+    /* Peel off the data indices and the base node offsets from
+     particle_sorters_. Meanwhile, reorder the data indices and the base node
+     offsets based on the sorting results. */
+    const uint64_t index_mask = ((uint64_t(1) << index_bits) - 1);
+    for (int p = 0; p < ssize(particle_sorters_); ++p) {
+      data_indices_[p] = particle_sorters_[p] & index_mask;
+      /* Right shift to erase the index bits, and then left shifts to recover
+       the zero data bits in base node offsets (as required by SPGrid). */
+      base_node_offsets_[p] = (particle_sorters_[p] >> index_bits) << data_bits;
+    }
+
+    /* Record the sentinel particles and the coloring of the blocks. */
+    sentinel_particles_.clear();
+    for (int b = 0; b < 8; ++b) {
+      colored_ranges_[b].clear();
+    }
+    uint64_t previous_page{};
+    int range_index = 0;
+    for (int p = 0; p < num_particles; ++p) {
+      /* The bits in the offset are ordered as follows:
+
+          page bits | block bits | data bits
+
+       block bits and data bits add up to log2_page bits.
+       We right shift to get the page bits. If the page bits of two offsets are
+       identical, it means that they belong to the same page.
+
+       We color the pages so that pages that are adjacent in 3D space all have
+       different colors, as described in [Hu et al. 2018]. */
+      const uint64_t page = base_node_offsets_[p] >> log2_page;
+      if (p == 0 || previous_page != page) {
+        previous_page = page;
+        sentinel_particles_.push_back(p);
+        const int color = spgrid.get_color(page);
+        ranges_indices_[color].push_back(range_index++);
+      }
+    }
+    sentinel_particles_.push_back(num_particles);
+
+    ConvertToRangeVector(sentinel_particles_, &ranges_);
+    for (int c = 0; c < 8; ++c) {
+      colored_ranges_[c].clear();
+      for (int r : ranges_indices_[c]) {
+        colored_ranges_[c].push_back(ranges_[r]);
+      }
+    }
+  }
+
+  /* Returns a vector of representative grid node offsets for all active
+   SpGrid blocks.
+
+   A grid block is considered "active" if it contains at least one particle.
+   The returned vector has exactly one offset per active block, allowing the
+   caller to identify or access the blocks that contain particles. */
+  std::vector<uint64_t> GetActiveBlockOffsets() const;
+
+  /* The order in which the particle data is preferably accessed.
+
+   Typical usage:
+
+     const std::vector<int>& data_indices = particle_sorter.data_indices();
+     const int num_particles = ssize(data_indices);
+     // Loop over some particle data `foo`.
+     for (int p = 0; p < num_particles; ++p) {
+       const auto& particle_data = foo_data[data_indices[p]];
+       // do something with particle_data.
+     }
+  */
+  const std::vector<int>& data_indices() const { return data_indices_; }
+
+  /* Returns the offsets of the base nodes for the particles in sorted order
+   (i.e. the same order as in `data_indices()`). */
+  const std::vector<uint64_t>& base_node_offsets() const {
+    return base_node_offsets_;
+  }
+
+  /* Returns an array of eight RangeVector objects, each representing a
+   distinct color partition of the particle data. After sorting, the disjoint
+   subsets in each partition can be processed safely in parallel using MPM
+   transfer kernels, without write hazards between concurrent threads.
+
+   Specifically, for each i from 0 to 7, colored_ranges()[i] is a collection
+   of non-overlapping, sorted ranges of particle data indices such that when
+   two different threads simultaneously perform particle to grid transfers
+   for particles in different ranges, they will not write to the same memory
+   locations in the grid data.
+
+   Example usage:
+   ```
+   for (int color = 0; color < 8; ++color) {
+     // It's safe to parallelize over the ranges within a single color.
+     #if defined(_OPENMP)
+     #pragma omp parallel for
+     #endif
+     for (const Range& range : particle_sorter.colored_ranges()[color]) {
+       for (int i = range.start; i < range.end; ++i) {
+         // do something with particle data at index
+         // `particle_sorter.data_indices()[i]`.
+       }
+     }
+   }
+   ```
+
+   Refer to Section 3.2 of the technical document of [Hu et al. 2018]
+   (https://yzhu.io/publication/mpmmls2018siggraph/supp.pdf) for more
+   details about how the colors are arranged and why there are 8 colors.
+
+   [Hu et al. 2018] Hu, Yuanming, et al. "A moving least squares material
+   point method with displacement discontinuity and two-way rigid body
+   coupling." ACM Transactions on Graphics (TOG) 37.4 (2018): 1-14. */
+  const std::array<RangeVector, 8>& colored_ranges() const {
+    return colored_ranges_;
+  }
+
+ private:
+  /* All but last entry store (sorted) indices of particles marking the boundary
+   of a new block (aka the sentinel particles). Recall that a `block` is a
+   subset of all grid nodes that occupy a consecutive blob of memory in SpGrid
+   (see class documentation of SpGrid for more details). The last entry stores
+   the number of particles. */
+  std::vector<int> sentinel_particles_;
+  std::vector<int> data_indices_;            // see data_indices().
+  std::vector<uint64_t> base_node_offsets_;  // see base_node_offsets().
+  /* The data being sorted -- it encodes the base node offset and the original
+   index of the particle. */
+  std::vector<uint64_t> particle_sorters_;
+  std::array<RangeVector, 8> colored_ranges_;  // see colored_ranges().
+  /* The ranges of the particle data indices, each range corresponds to
+   particles inside a single SpGrid page. */
+  RangeVector ranges_;
+  /* Helper data structure for building `colored_ranges_`. */
+  std::array<std::vector<int>, 8> ranges_indices_;
+};
+
+}  // namespace internal
+}  // namespace mpm
+}  // namespace multibody
+}  // namespace drake
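A standalone numeric sketch (not Drake code) of the key packing that `Sort()`
performs above, assuming the default SPGrid configuration used in this patch
(data_bits = 6, log2_max_grid_size = 10). The offset value and particle index
are made up for illustration; the shifts mirror the ones in the code.

```
#include <cassert>
#include <cstdint>

int main() {
  constexpr int data_bits = 6;
  constexpr int log2_max_grid_size = 10;
  constexpr int index_bits = 64 - 3 * log2_max_grid_size;  // 34 bits.
  constexpr int zero_page_bits =
      64 - data_bits - 3 * log2_max_grid_size;  // 28 bits.

  // A base node offset always has its low data_bits zeroed by SPGrid.
  const uint64_t base_node_offset = uint64_t(123456) << data_bits;
  const uint64_t particle_index = 78;

  // Pack: shift the offset into the high bits; the low index_bits then hold
  // the particle's original index.
  const uint64_t key = (base_node_offset << zero_page_bits) + particle_index;

  // Unpack: the same shifts in reverse recover both fields exactly.
  assert((key & ((uint64_t(1) << index_bits) - 1)) == particle_index);
  assert(((key >> index_bits) << data_bits) == base_node_offset);
  return 0;
}
```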
diff --git a/multibody/mpm/spgrid.h b/multibody/mpm/spgrid.h
index 93852fe40a5d..ff9851971262 100644
--- a/multibody/mpm/spgrid.h
+++ b/multibody/mpm/spgrid.h
@@ -24,6 +24,16 @@ namespace internal {
 template <typename T>
 using Pad = std::array<std::array<std::array<T, 3>, 3>, 3>;

+/* A subset of SPGrid flags and configs we care about. */
+struct SpGridFlags {
+  int log2_page{12};            // 4KB page size.
+  int log2_max_grid_size{10};   // Largest grid size is 1024 x 1024 x 1024.
+  int data_bits{6};             // Number of bits to represent GridData.
+  int num_nodes_in_block_x{4};  // Number of nodes in a block in x direction.
+  int num_nodes_in_block_y{4};  // Number of nodes in a block in y direction.
+  int num_nodes_in_block_z{4};  // Number of nodes in a block in z direction.
+};
+
 /* SpGrid is a wrapper class around the SPGrid library designed for managing
  sparse grid data in 3D space.

@@ -58,7 +68,7 @@
  @tparam GridData  The type of data stored in the grid. It must satisfy the
                    following requirements:
                    - It must have a default constructor.
-                   - It must have a member function `set_zero()` that sets the
-                     data to zero.
+                   - It must have a member function `reset()`.
                    - Its size must be less than or equal to 4KB.
  @tparam log2_max_grid_size
  SPGrid imposes a limit on the maximum number of grid points per dimension,
@@ -181,7 +191,7 @@ class SpGrid {
     }
     blocks_.Update_Block_Offsets();
     IterateGrid([](GridData* node_data) {
-      node_data->set_zero();
+      node_data->reset();
     });
   }

@@ -191,6 +201,9 @@ class SpGrid {
     const uint64_t world_space_offset = Mask::Linear_Offset(x, y, z);
     return Mask::Packed_Add(world_space_offset, origin_offset_);
   }
+  Offset CoordinateToOffset(const Vector3<int>& coord) const {
+    return CoordinateToOffset(coord[0], coord[1], coord[2]);
+  }

   /* Returns the 3D grid coordinates in world space given the offset (1D
    index) of a grid node.
@@ -316,6 +329,27 @@ class SpGrid {
     }
   }

+  /* Returns the flags associated with `this` SpGrid. */
+  SpGridFlags flags() const {
+    return SpGridFlags{.log2_page = kLog2Page,
+                       .log2_max_grid_size = log2_max_grid_size,
+                       .data_bits = kDataBits,
+                       .num_nodes_in_block_x = kNumNodesInBlockX,
+                       .num_nodes_in_block_y = kNumNodesInBlockY,
+                       .num_nodes_in_block_z = kNumNodesInBlockZ};
+  }
+
+  /* Returns the color of the block given the page bits of the node offset.
+   Blocks with different colors are guaranteed to be non-adjacent. */
+  static int get_color(uint64_t page) {
+    /* According to [Setaluri et al. 2014], the blocks are arranged in a 3D
+     space using Z-order curve. Blocks with 8 blocks apart are guaranteed to be
+     non-adjacent. */
+    constexpr int kNumColors = 8;
+    int color = (page & (kNumColors - 1));
+    return color;
+  }
+
  private:
   /* Returns reference to the data at the given grid offset.
    @pre Given `offset` is valid. */
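A minimal illustration (hypothetical page values, not part of the patch) of
the `get_color()` computation added above: the color is just the low three
bits of the Z-order page id, so pages 0..7 cycle through all eight colors and
two pages sharing a color are at least eight Z-order slots apart.

```
#include <cassert>
#include <cstdint>

int get_color(uint64_t page) { return static_cast<int>(page & 7); }

int main() {
  assert(get_color(0) == 0);
  assert(get_color(5) == 5);
  assert(get_color(8) == 0);   // Same color as page 0, eight slots away.
  assert(get_color(13) == 5);  // Same color as page 5.
  return 0;
}
```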
diff --git a/multibody/mpm/test/grid_data_test.cc b/multibody/mpm/test/grid_data_test.cc
index 474252e08d89..964b1e7f6853 100644
--- a/multibody/mpm/test/grid_data_test.cc
+++ b/multibody/mpm/test/grid_data_test.cc
@@ -92,9 +92,9 @@ TYPED_TEST(GridDataTest, Reset) {

   data.reset();
   EXPECT_TRUE(data.index_or_flag.is_inactive());
-  EXPECT_NE(data.scratch, data.scratch);
-  EXPECT_NE(data.v, data.v);
-  EXPECT_TRUE(std::isnan(data.m));
+  EXPECT_EQ(data.scratch, Vector3<TypeParam>::Zero());
+  EXPECT_EQ(data.v, Vector3<TypeParam>::Zero());
+  EXPECT_EQ(data.m, 0);
 }

 TYPED_TEST(GridDataTest, Equality) {
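An aside on the new test file below: its `SamplePoints()` helper scrambles the
sample order with `(17 * i) % N`. The following standalone snippet (purely
illustrative, not part of the patch) checks the underlying number-theory fact
that makes this safe: multiplying by a constant coprime to N is a bijection on
{0, ..., N-1}, so the scrambled indices repeat no value.

```
#include <cassert>
#include <set>

int main() {
  const int n = 7;  // Matches the SamplePoints(7) call in the test below.
  std::set<int> seen;
  for (int i = 0; i < n; ++i) {
    seen.insert((17 * i) % n);  // 17 is coprime to 7.
  }
  assert(static_cast<int>(seen.size()) == n);  // A full permutation.
  return 0;
}
```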
diff --git a/multibody/mpm/test/particle_sorter_test.cc b/multibody/mpm/test/particle_sorter_test.cc
new file mode 100644
index 000000000000..3cfd0f75050a
--- /dev/null
+++ b/multibody/mpm/test/particle_sorter_test.cc
@@ -0,0 +1,151 @@
+#include "drake/multibody/mpm/particle_sorter.h"
+
+#include <algorithm>
+#include <numeric>
+#include <set>
+
+#include <gtest/gtest.h>
+
+#include "drake/multibody/mpm/bspline_weights.h"
+#include "drake/multibody/mpm/grid_data.h"
+#include "drake/multibody/mpm/spgrid.h"
+
+namespace drake {
+namespace multibody {
+namespace mpm {
+namespace internal {
+namespace {
+
+using Eigen::Vector3i;
+
+GTEST_TEST(ConvertToRangeVectorTest, ConvertToRangeVector) {
+  std::vector<int> data = {1, 3, 6, 9};
+  RangeVector ranges;
+  ConvertToRangeVector(data, &ranges);
+  ASSERT_EQ(ranges.size(), 3);
+  EXPECT_EQ(ranges[0].start, 1);
+  EXPECT_EQ(ranges[0].end, 3);
+  EXPECT_EQ(ranges[1].start, 3);
+  EXPECT_EQ(ranges[1].end, 6);
+  EXPECT_EQ(ranges[2].start, 6);
+  EXPECT_EQ(ranges[2].end, 9);
+}
+
+/* Sample N^3 points in the box [0, 1] x [0, 1] x [0, 1]. */
+std::vector<Vector3<double>> SamplePoints(int N) {
+  std::vector<Vector3<double>> points;
+  for (int i = 0; i < N; ++i) {
+    /* Multiply by a large prime number and mod by N so that the sampled points
+     are not ordered in any obvious way. */
+    int ii = (17 * i) % N;
+    for (int j = 0; j < N; ++j) {
+      int jj = (23 * j) % N;
+      for (int k = 0; k < N; ++k) {
+        int kk = (29 * k) % N;
+        points.push_back(
+            Vector3<double>(ii / (N - 1.0), jj / (N - 1.0), kk / (N - 1.0)));
+      }
+    }
+  }
+  return points;
+}
+
+/* Returns true if the one-rings of the two given grid nodes overlap. */
+bool OverlapOneRing(const Vector3i& a, const Vector3i& b) {
+  return ((a - b).cwiseAbs().maxCoeff() <= 2);
+}
+
+GTEST_TEST(ParticleSorterTest, Sort) {
+  using Grid = SpGrid<GridData<double>>;
+  const Grid spgrid;
+  const double dx = 0.2;
+  const std::vector<Vector3<double>> particle_positions = SamplePoints(7);
+
+  ParticleSorter sorter;
+  sorter.Sort(spgrid, dx, particle_positions);
+  const std::vector<int>& data_indices = sorter.data_indices();
+  const std::vector<uint64_t>& base_node_offsets = sorter.base_node_offsets();
+  for (int i = 1; i < ssize(data_indices); ++i) {
+    /* Confirm that after the sort, the base nodes are in ascending order. */
+    EXPECT_LE(base_node_offsets[i - 1], base_node_offsets[i]);
+    /* When base nodes are tied, confirm that indices are used to break the
+     tie. */
+    if (base_node_offsets[i - 1] == base_node_offsets[i]) {
+      EXPECT_LT(data_indices[i - 1], data_indices[i]);
+    }
+  }
+
+  /* Confirm that the sorted base nodes are indeed a permutation of the
+   original base nodes. */
+  for (int i = 0; i < ssize(particle_positions); ++i) {
+    const Vector3<double>& particle_position =
+        particle_positions[data_indices[i]];
+    const Vector3i expected_base_node =
+        ComputeBaseNode(particle_position / dx);
+    const uint64_t expected_base_node_offset =
+        spgrid.CoordinateToOffset(expected_base_node);
+    EXPECT_EQ(base_node_offsets[i], expected_base_node_offset);
+  }
+
+  /* Confirm that the data indices are indeed a permutation of the original
+   indices. */
+  std::vector<int> data_indices_copy = data_indices;
+  std::sort(data_indices_copy.begin(), data_indices_copy.end());
+  std::vector<int> expected_sorted_indices(ssize(particle_positions));
+  std::iota(expected_sorted_indices.begin(), expected_sorted_indices.end(), 0);
+  EXPECT_EQ(data_indices_copy, expected_sorted_indices);
+
+  /* Confirm that the colored ranges are indeed write-hazard-free. */
+  const std::array<RangeVector, 8>& colored_ranges = sorter.colored_ranges();
+  for (int c = 0; c < 8; ++c) {
+    const RangeVector& ranges = colored_ranges[c];
+    for (int i = 0; i < ssize(ranges); ++i) {
+      for (int j = i + 1; j < ssize(ranges); ++j) {
+        const Range& range_i = ranges[i];
+        const Range& range_j = ranges[j];
+        for (int p = range_i.start; p < range_i.end; ++p) {
+          for (int q = range_j.start; q < range_j.end; ++q) {
+            const Vector3i base_node_p =
+                spgrid.OffsetToCoordinate(base_node_offsets[p]);
+            const Vector3i base_node_q =
+                spgrid.OffsetToCoordinate(base_node_offsets[q]);
+            /* As long as one-rings of the base nodes of the particles don't
+             overlap, there's no write-hazard. */
+            EXPECT_FALSE(OverlapOneRing(base_node_p, base_node_q));
+          }
+        }
+      }
+    }
+  }
+
+  /* Confirm that GetActiveBlockOffsets() is correct. To do that, we allocate
+   another SpGrid with the returned offsets and then check that all base nodes
+   of the particles, as well as their one rings, are allocated. (That's the
+   required allocation for the transfer.) */
+  const std::vector<uint64_t> active_block_offsets =
+      sorter.GetActiveBlockOffsets();
+  Grid another_grid;
+  another_grid.Allocate(active_block_offsets);
+  std::set<uint64_t> allocated_offsets;
+  auto callback = [&](uint64_t offset, const GridData<double>&) {
+    allocated_offsets.insert(offset);
+  };
+
+  another_grid.IterateConstGridWithOffset(callback);
+  for (uint64_t offset : base_node_offsets) {
+    const Vector3i coord = another_grid.OffsetToCoordinate(offset);
+    /* Get the offsets of nodes in the one-ring of the base node. */
+    for (int i = -1; i <= 1; ++i) {
+      for (int j = -1; j <= 1; ++j) {
+        for (int k = -1; k <= 1; ++k) {
+          Vector3i neighbor = coord + Vector3i(i, j, k);
+          uint64_t neighbor_offset = another_grid.CoordinateToOffset(neighbor);
+          EXPECT_TRUE(allocated_offsets.contains(neighbor_offset));
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace mpm
+}  // namespace multibody
+}  // namespace drake
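Why the test above checks a Chebyshev distance of 2 in `OverlapOneRing`: each
particle writes to the 3x3x3 one-ring of nodes around its base node, and two
such rings share a node exactly when the base nodes differ by at most 2 in
every coordinate. A standalone sketch (illustrative types, not patch code):

```
#include <algorithm>
#include <cassert>
#include <cstdlib>

struct Vec3i { int x, y, z; };

// Two 3x3x3 one-rings overlap iff the Chebyshev (max-coordinate) distance
// between their centers is at most 2.
bool OverlapOneRing(const Vec3i& a, const Vec3i& b) {
  const int dx = std::abs(a.x - b.x);
  const int dy = std::abs(a.y - b.y);
  const int dz = std::abs(a.z - b.z);
  return std::max({dx, dy, dz}) <= 2;
}

int main() {
  assert(OverlapOneRing({0, 0, 0}, {2, 1, 0}));   // Rings share a node.
  assert(!OverlapOneRing({0, 0, 0}, {3, 0, 0}));  // Disjoint rings.
  return 0;
}
```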
diff --git a/multibody/mpm/test/spgrid_test.cc b/multibody/mpm/test/spgrid_test.cc
index d05e02a7d901..7ed5d04df11c 100644
--- a/multibody/mpm/test/spgrid_test.cc
+++ b/multibody/mpm/test/spgrid_test.cc
@@ -15,7 +15,7 @@ const int kLog2MaxGridSize = 5;

 struct GridData {
   Vector4d data;
-  void set_zero() { data.setZero(); }
+  void reset() { data.setZero(); }
 };

 bool operator==(const GridData& lhs, const GridData& rhs) {
@@ -93,6 +93,10 @@ GTEST_TEST(SpGridTest, CoordinateOffsetConversion) {
   EXPECT_EQ(coordinate1, Vector3<int>(10, 0, 0));
   Vector3<int> coordinate2 = grid.OffsetToCoordinate(offset2);
   EXPECT_EQ(coordinate2, Vector3<int>(0, 10, 0));
+
+  /* Test the two flavors of CoordinateToOffset. */
+  EXPECT_EQ(grid.CoordinateToOffset(Vector3<int>(1, 2, 3)),
+            grid.CoordinateToOffset(1, 2, 3));
 }

 GTEST_TEST(SpGridTest, SetPadDataAndGetPadData) {

From b30326c2f2e309605e7324b52ac337b417e6ac73 Mon Sep 17 00:00:00 2001
From: Xuchen Han
Date: Thu, 23 Jan 2025 17:35:16 -0800
Subject: [PATCH 2/2] Address platform comments

---
 multibody/mpm/BUILD.bazel                  |  15 ++
 multibody/mpm/particle_sorter.cc           |  72 ++++++
 multibody/mpm/particle_sorter.h            | 273 ++++++++++-----------
 multibody/mpm/spgrid.h                     |  17 +-
 multibody/mpm/spgrid_flags.h               |  21 ++
 multibody/mpm/test/particle_sorter_test.cc |  36 +--
 6 files changed, 266 insertions(+), 168 deletions(-)
 create mode 100644 multibody/mpm/spgrid_flags.h

diff --git a/multibody/mpm/BUILD.bazel b/multibody/mpm/BUILD.bazel
index 4ceebdd93b8c..fd906997d8df 100644
--- a/multibody/mpm/BUILD.bazel
+++ b/multibody/mpm/BUILD.bazel
@@ -18,6 +18,7 @@ drake_cc_package_library(
         ":grid_data",
         ":particle_data",
         ":particle_sorter",
+        ":spgrid_flags",
     ],
 )

@@ -71,6 +72,7 @@ drake_cc_library(
     ],
     deps = [
         ":bspline_weights",
+        ":spgrid_flags",
         "//common:essential",
         "//math:gradient",
     ],
@@ -88,11 +90,24 @@ drake_cc_library_linux_only(
         "spgrid.h",
     ],
     deps = [
+        ":spgrid_flags",
         "//common:essential",
         "@spgrid_internal",
     ],
 )

+drake_cc_library(
+    name = "spgrid_flags",
+    hdrs = [
+        "spgrid_flags.h",
+    ],
+    deps = [
+        ":bspline_weights",
+        "//common:essential",
+        "//math:gradient",
+    ],
+)
+
 drake_cc_googletest(
     name = "bspline_weights_test",
     deps = [
diff --git a/multibody/mpm/particle_sorter.cc b/multibody/mpm/particle_sorter.cc
index 725e31ca779e..1c9c38e3204c 100644
--- a/multibody/mpm/particle_sorter.cc
+++ b/multibody/mpm/particle_sorter.cc
@@ -21,6 +21,78 @@ std::vector<uint64_t> ParticleSorter::GetActiveBlockOffsets() const {
   return offsets;
 }

+void ParticleSorter::Initialize(int num_particles) {
+  particle_sorters_.resize(num_particles);
+  base_node_offsets_.resize(num_particles);
+  data_indices_.resize(num_particles);
+  sentinel_particles_.clear();
+  for (int c = 0; c < 8; ++c) {
+    ranges_indices_[c].clear();
+    colored_ranges_[c].clear();
+  }
+}
+
+void ParticleSorter::ComputeBitInformation(const SpGridFlags& flags,
+                                           int num_particles) {
+  /* Recall that all base node offsets in SPGrid have the following bit
+   representation:
+
+      page_bits | block_bits | data_bits
+
+   All the data bits are zero, guaranteed by SPGrid. In addition, a
+   few bits (starting from the left) in the page bits are also zero (we
+   explain why below). Therefore, we can truncate the zero bits in the
+   base_node_offset and squeeze in the original particle index into the lower
+   bits of a single 64 bit unsigned integer.
+
+   Q: Why are the left most bits in page_bits zero?
+
+   A: Because we cap the total number of allowed grid nodes: there are at most
+   2^(3*log2_max_grid_size) grid nodes in the grid, which requires
+   3*log2_max_grid_size bits to represent. SPGrid uses page_bits and
+   block_bits to represent the grid nodes (and data_bits are used, well, to
+   store data). The page_bits and block_bits have (64 - data_bits) in total,
+   and only 3*log2_max_grid_size bits are used, so the left most
+   (64 - data_bits - 3*log2_max_grid_size) bits are zero.
+
+   Q: So how many zeros can we truncate in the bits for base_node_offsets to
+   make room for the particle indices? And is it enough room?
+
+   A: There are (64 - data_bits - 3*log2_max_grid_size) zeros on the left and
+   data_bits zeros on the right, so they add up to (64 - 3*log2_max_grid_size)
+   zeros in total. With a typical log2_max_grid_size == 10, we have
+   (64 - 3*10) = 34 bits to work with, which is more than enough to store the
+   32 bit particle indices.
+
+   Therefore, the overall strategy is to left shift the base node offset by
+   (64 - data_bits - 3*log2_max_grid_size), which leaves the lower
+   (64 - 3*log2_max_grid_size) bits to be zero. We then use the right most 32
+   bits to store the original index of the particles. */
+  log2_max_grid_size_ = flags.log2_max_grid_size;
+  data_bits_ = flags.data_bits;
+  log2_page_ = flags.log2_page;
+  zero_page_bits_ = 64 - data_bits_ - 3 * log2_max_grid_size_;
+  index_bits_ = 64 - 3 * log2_max_grid_size_;  // = zero_page_bits_ + data_bits_
+  // TODO(xuchenhan-tri): This should be checked when we register particles to
+  // the MPM model.
+  /* Confirm that index_bits_ is enough to store num_particles. */
+  const uint64_t capacity = (uint64_t(1) << index_bits_);
+  DRAKE_DEMAND(capacity > static_cast<uint64_t>(num_particles));
+}
+
+void ParticleSorter::UnpackResults() {
+  const uint64_t index_mask = ((uint64_t(1) << index_bits_) - 1);
+
+  for (int p = 0; p < static_cast<int>(particle_sorters_.size()); ++p) {
+    /* Lower bits store the particle's original index. */
+    data_indices_[p] = (particle_sorters_[p] & index_mask);
+
+    /* Upper bits store the base_node_offset (shifted); shift back & re-apply
+     data_bits_. */
+    base_node_offsets_[p] = (particle_sorters_[p] >> index_bits_) << data_bits_;
+  }
+}
+
 }  // namespace internal
 }  // namespace mpm
 }  // namespace multibody
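A quick numeric check (illustrative only) of the constants that
`ComputeBitInformation()` above derives for the default flags
{log2_max_grid_size = 10, data_bits = 6}; it confirms the "(64 - 3*10) = 34
bits" arithmetic in the comment and the identity noted next to `index_bits_`.

```
#include <cstdint>

int main() {
  constexpr int log2_max_grid_size = 10;
  constexpr int data_bits = 6;
  constexpr int zero_page_bits = 64 - data_bits - 3 * log2_max_grid_size;
  constexpr int index_bits = 64 - 3 * log2_max_grid_size;
  static_assert(zero_page_bits == 28);
  static_assert(index_bits == 34);
  static_assert(index_bits == zero_page_bits + data_bits);
  // The DRAKE_DEMAND above therefore admits up to 2^34 - 1 particles.
  constexpr uint64_t capacity = uint64_t(1) << index_bits;
  static_assert(capacity == (uint64_t(1) << 34));
  return 0;
}
```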
diff --git a/multibody/mpm/particle_sorter.h b/multibody/mpm/particle_sorter.h
index 5be902337468..9544af0c8bc9 100644
--- a/multibody/mpm/particle_sorter.h
+++ b/multibody/mpm/particle_sorter.h
@@ -10,6 +10,7 @@
 #include "drake/common/ssize.h"
 #include "drake/math/autodiff_gradient.h"
 #include "drake/multibody/mpm/bspline_weights.h"
+#include "drake/multibody/mpm/spgrid_flags.h"

 namespace drake {
 namespace multibody {
@@ -17,14 +18,25 @@ namespace mpm {
 namespace internal {

 /* A range of integers [start, end). */
-struct Range {
-  int start;
-  int end;
+class Range {
+ public:
+  DRAKE_DEFAULT_COPY_AND_MOVE_AND_ASSIGN(Range);
+
+  Range(int start, int end) : start_(start), end_(end) {
+    DRAKE_ASSERT(start_ < end_);
+  }
+
+  int start() const { return start_; }
+  int end() const { return end_; }
+
+ private:
+  int start_{};
+  int end_{};
 };

 using RangeVector = std::vector<Range>;

-/* Given a sorted vector of integers key points, converts it to a vector of
+/* Given a sorted vector of integer key points, converts it to a vector of
  consecutive ranges. For example, given the input {1, 3, 6, 9}, the output
  should be {{1, 3}, {3, 6}, {6, 9}}.
 @@ -64,19 +76,9 @@ class ParticleSorter {
   template <typename T, typename SpGrid>
   void Sort(const SpGrid& spgrid, double dx,
             const std::vector<Vector3<T>>& particle_positions) {
-    const auto& flags = spgrid.flags();
-    const int log2_max_grid_size = flags.log2_max_grid_size;
-    const int data_bits = flags.data_bits;
-    const int log2_page = flags.log2_page;
-
-    const int num_particles = particle_positions.size();
-    data_indices_.resize(num_particles);
-    base_node_offsets_.resize(num_particles);
-    particle_sorters_.resize(num_particles);
-
-    // TODO(xuchenhan-tri): Switch to using radix sort to exploit the fact that
-    // our keys are 64 bit unsigned integers. It's proven to be much faster than
-    // std::sort in practice.
-
+    const int num_particles = particle_positions.size();
+    /* Resize data structures and clear old data. */
+    Initialize(num_particles);

     /* We sort particles first based on their base node offsets in the spgrid;
      if they are the same, then we sort the particles by their original indices.
@@ -90,125 +92,27 @@ class ParticleSorter {

        bits_for_base_node_offset | bits_for_particle_index.

     However, that's easier said than done, because the base node offset is
-     already taking 64 bits! Fortunately, we notice that all base node offsets
-     in the spgrid of the particle have the following structure
-
-        page_bits | block_bits | data_bits
-
-     with all the data bits being zero (guaranteed by SPGrid). In addition, a
-     few bits (starting from the left) in the page bits are also zero (we
-     explain why below). Therefore, we can truncate the zero bits in the
-     base_node_offset and squeeze in the original particle index into the lower
-     bits of a single 64 bit unsigned integer.
-
-     Q: Why are the left most bits in page_bits zero?
-
-     A: Because we cap the total number of allowed grid nodes: there are at most
-     2^(3*log2_max_grid_size) grid nodes in the grid, which requires
-     3*log2_max_grid_size bits to represent. SPGrid uses page_bits and
-     block_bits to represent the grid nodes (and data_bits are used, well, to
-     store data). The page_bits and block_bits have (64 - data_bits) in total,
-     and only 3*log2_max_grid_size bits are used, so the left most
-     (64 - data_bits - 3*log2_max_grid_size) bits are zero.
-
-     Q: So how many zeros can we truncate in the bits for base_node_offsets to
-     make room for the particle indices? And is it enough room?
-
-     A: There are (64 - data_bits - 3*log2_max_grid_size) zeros on the left and
-     data_bits zeros on the right, so they add up to (64 - 3*log2_max_grid_size)
-     zeros in total. With a typical log2_max_grid_size == 10, we have
-     (64 - 3*10) = 34 bits to work with, which is more than enough to store the
-     32 bit particle indices.
-
-     Therefore, the overall strategy is to left shift the base node offset by
-     (64 - data_bits - 3*log2_max_grid_size), which leaves the lower
-     (64 - 3*log2_max_grid_size) bits to be zero. We then use the right most 32
-     bits to store the original index of the particles. */
-    const int index_bits = 64 - 3 * log2_max_grid_size;
-    // TODO(xuchenhan-tri): The following check (i.e. that we don't have too
-    // many particles) should be enforced when we register the particles to the
-    // MPM model.
-    /* Make sure that we have enough bits to store the index of the particles.
-    */
-    DRAKE_DEMAND(uint64_t(1) << index_bits >
-                 static_cast<uint64_t>(num_particles));
-    const int zero_page_bits = 64 - data_bits - 3 * log2_max_grid_size;
-
-    for (int p = 0; p < num_particles; ++p) {
-      /* We allow T == AutoDiffXd only for testing purposes. The positions in
-       those tests are guaranteed to have zero gradients. Here we cast
-       particle_x to double regardless of what type T is to simplify the code
-       path for computing base node immediately below. */
-      const auto& particle_x = [&]() -> const Vector3<double>& {
-        if constexpr (std::is_same_v<T, double>) {
-          return particle_positions[p];
-        } else if constexpr (std::is_same_v<T, float>) {
-          return particle_positions[p].template cast<double>();
-        } else {
-          return math::DiscardZeroGradient(particle_positions[p]);
-        }
-      }();
-      const Vector3<int> base_node = ComputeBaseNode(particle_x / dx);
-      base_node_offsets_[p] =
-          spgrid.CoordinateToOffset(base_node[0], base_node[1], base_node[2]);
-      data_indices_[p] = p;
-      /* Confirm the data bits of the base node offset are all zero. */
-      DRAKE_ASSERT((base_node_offsets_[p] & ((uint64_t(1) << data_bits) - 1)) ==
-                   0);
-      /* Confirm the left most bits in the page bits are unused. */
-      DRAKE_ASSERT((base_node_offsets_[p] &
-                    ~((uint64_t(1) << (64 - zero_page_bits)) - 1)) == 0);
-      particle_sorters_[p] =
-          (base_node_offsets_[p] << zero_page_bits) + data_indices_[p];
-    }
-    std::sort(particle_sorters_.begin(), particle_sorters_.end());
-
-    /* Peel off the data indices and the base node offsets from
-     particle_sorters_. Meanwhile, reorder the data indices and the base node
-     offsets based on the sorting results. */
-    const uint64_t index_mask = ((uint64_t(1) << index_bits) - 1);
-    for (int p = 0; p < ssize(particle_sorters_); ++p) {
-      data_indices_[p] = particle_sorters_[p] & index_mask;
-      /* Right shift to erase the index bits, and then left shifts to recover
-       the zero data bits in base node offsets (as required by SPGrid). */
-      base_node_offsets_[p] = (particle_sorters_[p] >> index_bits) << data_bits;
-    }
-
-    /* Record the sentinel particles and the coloring of the blocks. */
-    sentinel_particles_.clear();
-    for (int b = 0; b < 8; ++b) {
-      colored_ranges_[b].clear();
-    }
-    uint64_t previous_page{};
-    int range_index = 0;
-    for (int p = 0; p < num_particles; ++p) {
-      /* The bits in the offset are ordered as follows:
-
-          page bits | block bits | data bits
-
-       block bits and data bits add up to log2_page bits.
-       We right shift to get the page bits. If the page bits of two offsets are
-       identical, it means that they belong to the same page.
-
-       We color the pages so that pages that are adjacent in 3D space all have
-       different colors, as described in [Hu et al. 2018]. */
-      const uint64_t page = base_node_offsets_[p] >> log2_page;
-      if (p == 0 || previous_page != page) {
-        previous_page = page;
-        sentinel_particles_.push_back(p);
-        const int color = spgrid.get_color(page);
-        ranges_indices_[color].push_back(range_index++);
-      }
-    }
-    sentinel_particles_.push_back(num_particles);
-
-    ConvertToRangeVector(sentinel_particles_, &ranges_);
-    for (int c = 0; c < 8; ++c) {
-      colored_ranges_[c].clear();
-      for (int r : ranges_indices_[c]) {
-        colored_ranges_[c].push_back(ranges_[r]);
-      }
-    }
+     already taking 64 bits! Fortunately, there are some dead bits that we can
+     truncate to make some room. We now check how many bits we can truncate
+     based on information from the spgrid flags and how many particles we have.
+     */
+    const auto& flags = spgrid.flags();
+    ComputeBitInformation(flags, num_particles);
+
+    /* Build 64-bit keys that encode (page_offset << shift) + particle_index. */
+    BuildKeys(spgrid, dx, particle_positions);
+
+    // TODO(xuchenhan-tri): Switch to using radix sort to exploit the fact that
+    // our keys are 64 bit unsigned integers. It's proven to be much faster than
+    // std::sort in practice.
+    std::sort(particle_sorters_.begin(), particle_sorters_.end());
+
+    /* Extract sorted data indices and base node offsets from the sorted keys.
+    */
+    UnpackResults();
+
+    /* Build the results for `colored_ranges()`. */
+    BuildColoredRanges(spgrid, num_particles);
   }

   /* Returns a vector of representative grid node offsets for all active
    SpGrid blocks.
@@ -254,7 +158,7 @@ class ParticleSorter {
     #pragma omp parallel for
     #endif
     for (const Range& range : particle_sorter.colored_ranges()[color]) {
-      for (int i = range.start; i < range.end; ++i) {
+      for (int i = range.start(); i < range.end(); ++i) {
        // do something with particle data at index
        // `particle_sorter.data_indices()[i]`.
      }
@@ -280,20 +184,110 @@ class ParticleSorter {
   }

  private:
-  /* All but last entry store (sorted) indices of particles marking the boundary
-   of a new block (aka the sentinel particles). Recall that a `block` is a
-   subset of all grid nodes that occupy a consecutive blob of memory in SpGrid
-   (see class documentation of SpGrid for more details). The last entry stores
-   the number of particles. */
+  /* Helper for Sort(). Resizes all containers and clears old data. */
+  void Initialize(int num_particles);
+
+  /* Helper for Sort(). Computes how to perform the bit manipulation that
+   squeezes the base node offsets and particle indices into a single 64 bit
+   unsigned int. */
+  void ComputeBitInformation(const SpGridFlags& flags, int num_particles);
+
+  /* Helper for `Sort()` that builds 64-bit sort keys for each particle. */
+  template <typename T, typename SpGrid>
+  void BuildKeys(const SpGrid& spgrid, double dx,
+                 const std::vector<Vector3<T>>& particle_positions) {
+    const int num_particles = particle_positions.size();
+    for (int p = 0; p < num_particles; ++p) {
+      /* We allow T == AutoDiffXd only for testing purposes. The positions in
+       those tests are guaranteed to have zero gradients. Here we cast
+       particle_x to double regardless of what type T is to simplify the code
+       path for computing base node immediately below.
+       The computation is simple enough that converting from float to double
+       isn't going to trigger any noticeable overhead. (The main advantage of
+       using floats is their compact memory footprint anyway.) */
+      const auto& particle_x = [&]() -> const Vector3<double>& {
+        if constexpr (std::is_same_v<T, double>) {
+          return particle_positions[p];
+        } else if constexpr (std::is_same_v<T, float>) {
+          return particle_positions[p].template cast<double>();
+        } else {
+          return math::DiscardZeroGradient(particle_positions[p]);
+        }
+      }();
+      const Vector3<int> base_node = ComputeBaseNode(particle_x / dx);
+      base_node_offsets_[p] =
+          spgrid.CoordinateToOffset(base_node[0], base_node[1], base_node[2]);
+      /* Confirm the data bits of the base node offset are all zero. */
+      DRAKE_ASSERT(
+          (base_node_offsets_[p] & ((uint64_t(1) << data_bits_) - 1)) == 0);
+      /* Confirm the left most bits in the page bits are unused. */
+      DRAKE_ASSERT((base_node_offsets_[p] &
+                    ~((uint64_t(1) << (64 - zero_page_bits_)) - 1)) == 0);
+      particle_sorters_[p] = (base_node_offsets_[p] << zero_page_bits_) + p;
+    }
+  }
+
+  /* Helper for `Sort()` that unpacks the key into base node offsets and data
+   indices. */
+  void UnpackResults();
+
+  /* Helper for `Sort()` that builds `colored_ranges_`. */
+  template <typename SpGrid>
+  void BuildColoredRanges(const SpGrid& spgrid, int num_particles) {
+    /* Keep track of where each new "page" begins. */
+    uint64_t previous_page = 0;
+    int range_index = 0;
+
+    for (int p = 0; p < num_particles; ++p) {
+      /* The bits in the offset are ordered as follows:
+
+          page bits | block bits | data bits
+
+       block bits and data bits add up to log2_page bits.
+       We right shift to get the page bits. If the page bits of two offsets are
+       identical, it means that they belong to the same page.
+
+       We color the pages so that pages that are adjacent in 3D space all have
+       different colors, as described in [Hu et al. 2018]. */
+      const uint64_t page = base_node_offsets_[p] >> log2_page_;
+
+      if (p == 0 || page != previous_page) {
+        previous_page = page;
+        sentinel_particles_.push_back(p);
+        const int color = spgrid.get_color(page);
+        ranges_indices_[color].push_back(range_index++);
+      }
+    }
+    /* Final sentinel at the end (always the number of particles). */
+    sentinel_particles_.push_back(num_particles);
+
+    /* Convert sentinel indices -> (start, end) ranges. */
+    ConvertToRangeVector(sentinel_particles_, &ranges_);
+
+    /* For each color, pick out the relevant [start, end) pairs. */
+    for (int c = 0; c < 8; ++c) {
+      colored_ranges_[c].clear();
+      for (int r : ranges_indices_[c]) {
+        colored_ranges_[c].push_back(ranges_[r]);
+      }
+    }
+  }
+
+  /* All but last entry store (sorted) indices of particles marking the
+   boundary of a new block (aka the sentinel particles). Recall that a `block`
+   is a subset of all grid nodes that occupy a consecutive blob of memory in
+   SpGrid (see class documentation of SpGrid for more details). The last entry
+   stores the number of particles. */
   std::vector<int> sentinel_particles_;
   std::vector<int> data_indices_;            // see data_indices().
   std::vector<uint64_t> base_node_offsets_;  // see base_node_offsets().
   /* The data being sorted -- it encodes the base node offset and the original
    index of the particle. */
   std::vector<uint64_t> particle_sorters_;
   std::array<RangeVector, 8> colored_ranges_;  // see colored_ranges().
   /* The ranges of the particle data indices, each range corresponds to
    particles inside a single SpGrid page. */
   RangeVector ranges_;
   /* Helper data structure for building `colored_ranges_`. */
   std::array<std::vector<int>, 8> ranges_indices_;
+
+  /* Helper data for `Sort()`. */
+  int log2_max_grid_size_{};
+  int data_bits_{};
+  int log2_page_{};
+  int zero_page_bits_{};
+  int index_bits_{};
 };
diff --git a/multibody/mpm/spgrid.h b/multibody/mpm/spgrid.h
index ff9851971262..873b62dc6893 100644
--- a/multibody/mpm/spgrid.h
+++ b/multibody/mpm/spgrid.h
@@ -14,6 +14,7 @@

 #include "drake/common/drake_copyable.h"
 #include "drake/common/eigen_types.h"
+#include "drake/multibody/mpm/spgrid_flags.h"

 namespace drake {
 namespace multibody {
@@ -24,16 +25,6 @@ namespace internal {
 template <typename T>
 using Pad = std::array<std::array<std::array<T, 3>, 3>, 3>;

-/* A subset of SPGrid flags and configs we care about. */
-struct SpGridFlags {
-  int log2_page{12};            // 4KB page size.
-  int log2_max_grid_size{10};   // Largest grid size is 1024 x 1024 x 1024.
-  int data_bits{6};             // Number of bits to represent GridData.
-  int num_nodes_in_block_x{4};  // Number of nodes in a block in x direction.
-  int num_nodes_in_block_y{4};  // Number of nodes in a block in y direction.
-  int num_nodes_in_block_z{4};  // Number of nodes in a block in z direction.
-};
-
 /* SpGrid is a wrapper class around the SPGrid library designed for managing
  sparse grid data in 3D space.

@@ -342,9 +333,9 @@ class SpGrid {
   /* Returns the color of the block given the page bits of the node offset.
    Blocks with different colors are guaranteed to be non-adjacent. */
   static int get_color(uint64_t page) {
-    /* According to [Setaluri et al. 2014], the blocks are arranged in a 3D
-     space using Z-order curve. Blocks with 8 blocks apart are guaranteed to be
-     non-adjacent. */
+    /* According to [Setaluri et al. 2014], the blocks are arranged in 3D space
+     along a Z-order curve, so any two blocks whose Z-order indices are eight
+     apart are guaranteed to be non-adjacent. */
     constexpr int kNumColors = 8;
     int color = (page & (kNumColors - 1));
     return color;
   }
diff --git a/multibody/mpm/spgrid_flags.h b/multibody/mpm/spgrid_flags.h
new file mode 100644
index 000000000000..bee2ec4af3d4
--- /dev/null
+++ b/multibody/mpm/spgrid_flags.h
@@ -0,0 +1,21 @@
+#pragma once
+
+namespace drake {
+namespace multibody {
+namespace mpm {
+namespace internal {
+
+/* A subset of SPGrid flags and configs we care about. */
+struct SpGridFlags {
+  int log2_page{12};            // 4KB page size.
+  int log2_max_grid_size{10};   // Largest grid size is 1024 x 1024 x 1024.
+  int data_bits{6};             // Number of bits to represent GridData.
+  int num_nodes_in_block_x{4};  // Number of nodes in a block in x direction.
+  int num_nodes_in_block_y{4};  // Number of nodes in a block in y direction.
+  int num_nodes_in_block_z{4};  // Number of nodes in a block in z direction.
+};
+
+}  // namespace internal
+}  // namespace mpm
+}  // namespace multibody
+}  // namespace drake
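An illustrative consistency check (my arithmetic under the defaults above, not
Drake code) of how the SpGridFlags values relate: a 2^12-byte page holding
2^6-byte node data stores 2^(12-6) = 64 nodes, i.e. a 4 x 4 x 4 block, which
matches the num_nodes_in_block_* defaults.

```
int main() {
  constexpr int log2_page = 12;  // 4KB page.
  constexpr int data_bits = 6;   // 64-byte GridData per node.
  constexpr int nodes_per_block = 1 << (log2_page - data_bits);
  static_assert(nodes_per_block == 64);
  static_assert(4 * 4 * 4 == nodes_per_block);  // A 4 x 4 x 4 block.
  return 0;
}
```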
diff --git a/multibody/mpm/test/particle_sorter_test.cc b/multibody/mpm/test/particle_sorter_test.cc
index 3cfd0f75050a..cb2e3204354d 100644
--- a/multibody/mpm/test/particle_sorter_test.cc
+++ b/multibody/mpm/test/particle_sorter_test.cc
@@ -21,27 +21,27 @@ GTEST_TEST(ConvertToRangeVectorTest, ConvertToRangeVector) {
   RangeVector ranges;
   ConvertToRangeVector(data, &ranges);
   ASSERT_EQ(ranges.size(), 3);
-  EXPECT_EQ(ranges[0].start, 1);
-  EXPECT_EQ(ranges[0].end, 3);
-  EXPECT_EQ(ranges[1].start, 3);
-  EXPECT_EQ(ranges[1].end, 6);
-  EXPECT_EQ(ranges[2].start, 6);
-  EXPECT_EQ(ranges[2].end, 9);
+  EXPECT_EQ(ranges[0].start(), 1);
+  EXPECT_EQ(ranges[0].end(), 3);
+  EXPECT_EQ(ranges[1].start(), 3);
+  EXPECT_EQ(ranges[1].end(), 6);
+  EXPECT_EQ(ranges[2].start(), 6);
+  EXPECT_EQ(ranges[2].end(), 9);
 }

-/* Sample N^3 points in the box [0, 1] x [0, 1] x [0, 1]. */
-std::vector<Vector3<double>> SamplePoints(int N) {
+/* Sample n^3 points in the box [0, 1] x [0, 1] x [0, 1]. */
+std::vector<Vector3<double>> SamplePoints(int n) {
   std::vector<Vector3<double>> points;
-  for (int i = 0; i < N; ++i) {
-    /* Multiply by a large prime number and mod by N so that the sampled points
+  for (int i = 0; i < n; ++i) {
+    /* Multiply by a large prime number and mod by n so that the sampled points
      are not ordered in any obvious way. */
-    int ii = (17 * i) % N;
-    for (int j = 0; j < N; ++j) {
-      int jj = (23 * j) % N;
-      for (int k = 0; k < N; ++k) {
-        int kk = (29 * k) % N;
+    int ii = (17 * i) % n;
+    for (int j = 0; j < n; ++j) {
+      int jj = (23 * j) % n;
+      for (int k = 0; k < n; ++k) {
+        int kk = (29 * k) % n;
         points.push_back(
-            Vector3<double>(ii / (N - 1.0), jj / (N - 1.0), kk / (N - 1.0)));
+            Vector3<double>(ii / (n - 1.0), jj / (n - 1.0), kk / (n - 1.0)));
       }
     }
   }
   return points;
@@ -100,8 +100,8 @@ GTEST_TEST(ParticleSorterTest, Sort) {
       for (int j = i + 1; j < ssize(ranges); ++j) {
         const Range& range_i = ranges[i];
         const Range& range_j = ranges[j];
-        for (int p = range_i.start; p < range_i.end; ++p) {
-          for (int q = range_j.start; q < range_j.end; ++q) {
+        for (int p = range_i.start(); p < range_i.end(); ++p) {
+          for (int q = range_j.start(); q < range_j.end(); ++q) {
             const Vector3i base_node_p =
                 spgrid.OffsetToCoordinate(base_node_offsets[p]);
             const Vector3i base_node_q =