Merge pull request #462 from ValeevGroup/evaleev/fixup/gemm-benchmark…

…s-tiling gemm examples support nonuniform tiling
ValeevGroup · Jul 23, 2024 · cc0cd50 · cc0cd50
2 parents 253d59c + 9d5ad23
commit cc0cd50
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 83 deletions.
diff --git a/examples/device/ta_cc_abcd_device.cpp b/examples/device/ta_cc_abcd_device.cpp
@@ -182,8 +182,8 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
   const double flops_per_fma =
       (complex_T ? 8 : 2);  // 1 multiply takes 6/1 flops for complex/real
                             // 1 add takes 2/1 flops for complex/real
-  const double n_gflop = flops_per_fma * std::pow(n_occ, 2) *
-                         std::pow(n_uocc, 4) / std::pow(1024., 3);
+  const double n_gflop =
+      flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9;
 
   using deviceTile =
       btas::Tensor<T, TA::Range, TiledArray::device_um_btas_varray<T>>;

diff --git a/examples/device/ta_dense_device.cpp b/examples/device/ta_dense_device.cpp
@@ -36,10 +36,6 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
   using RT = TiledArray::detail::scalar_t<Storage>;
   constexpr auto complex_T = TiledArray::detail::is_complex_v<T>;
 
-  const std::size_t Tm = Nm / Bm;
-  const std::size_t Tn = Nn / Bn;
-  const std::size_t Tk = Nk / Bk;
-
   const std::int64_t nflops =
       (complex_T ? 8 : 2)  // 1 multiply takes 6/1 flops for complex/real
                            // 1 add takes 2/1 flops for complex/real
@@ -64,16 +60,16 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
 
   // Construct TiledRange
   std::vector<unsigned int> blocking_m;
-  blocking_m.reserve(Tm + 1);
   for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i);
+  const std::size_t Tm = blocking_m.size() - 1;
 
   std::vector<unsigned int> blocking_n;
-  blocking_n.reserve(Tn + 1);
   for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i);
+  const std::size_t Tn = blocking_n.size() - 1;
 
   std::vector<unsigned int> blocking_k;
-  blocking_k.reserve(Tk + 1);
   for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i);
+  const std::size_t Tk = blocking_k.size();
 
   // Structure of c
   std::vector<TiledArray::TiledRange1> blocking_C;
@@ -255,11 +251,6 @@ int try_main(int argc, char **argv) {
     std::cerr << "Error: block sizes must be greater than zero.\n";
     return 1;
   }
-  if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) {
-    std::cerr
-        << "Error: dimension size must be evenly divisible by block size.\n";
-    return 1;
-  }
   const long nrepeat = (argc >= 8 ? atol(argv[7]) : 5);
   if (nrepeat <= 0) {
     std::cerr << "Error: number of repetitions must be greater than zero.\n";

diff --git a/examples/gemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp
@@ -34,17 +34,30 @@ bool to_bool(const char* str) {
 // if n = average tile size
 // this will produce tiles of these sizes: n+1, n-1, n+2, n-2, etc.
 // the last tile absorbs the remainder
-std::vector<unsigned int> make_tiling(unsigned int range_size,
-                                      unsigned int ntiles) {
-  const int average_tile_size = range_size / ntiles;
-  std::vector<unsigned int> result(ntiles + 1);
-  result[0] = 0;
-  for (long t = 0; t != ntiles - 1; ++t) {
-    result[t + 1] = result[t] + average_tile_size +
-                    std::max(static_cast<int>((t % 2 == 0) ? (t + 1) : (-t)),
-                             1 - average_tile_size);
+std::vector<unsigned int> make_nonuniform_tiling(unsigned int range_size,
+                                                 int tile_size) {
+  std::vector<unsigned int> result;
+  result.push_back(0);
+  for (long t = 0; true; ++t) {
+    unsigned int next_tile_boundary =
+        result.back() + tile_size +
+        std::max(static_cast<int>((t % 2 == 0) ? (t + 1) : (-t)),
+                 1 - tile_size);
+    if (next_tile_boundary >= range_size) break;
+    result.push_back(next_tile_boundary);
   }
-  result[ntiles] = range_size;
+  if (result.back() != range_size) result.push_back(range_size);
+  return result;
+}
+
+// makes tiles as uniform as possible
+std::vector<unsigned int> make_uniform_tiling(unsigned int range_size,
+                                              int tile_size) {
+  std::vector<unsigned int> result;
+  for (unsigned int t = 0; t <= range_size; t += tile_size) {
+    result.push_back(t);
+  }
+  if (result.back() != range_size) result.push_back(range_size);
   return result;
 }
 
@@ -72,70 +85,93 @@ int main(int argc, char** argv) {
 
     // Get command line arguments
     if (argc < 5) {
-      std::cout << "Mocks t2(i,a,j,b) * v(a,b,c,d) term in CC amplitude eqs"
-                << std::endl
-                << "Usage: " << argv[0]
-                << " occ_size occ_nblocks uocc_size "
-                   "uocc_nblocks [repetitions] [use_complex]"
-                << std::endl;
+      std::cout
+          << "Mocks t2(i,j,a,b) * v(a,b,c,d) term in CC amplitude eqs"
+          << std::endl
+          << "Usage: " << argv[0]
+          << " occ_size occ_tilesize uocc_size "
+             "uocc_tilesize [repetitions] [scalar=double] [uniform_tiling=1]"
+          << std::endl;
       return 0;
     }
     const long n_occ = atol(argv[1]);
-    const long nblk_occ = atol(argv[2]);
+    const long b_occ = atol(argv[2]);
     const long n_uocc = atol(argv[3]);
-    const long nblk_uocc = atol(argv[4]);
+    const long b_uocc = atol(argv[4]);
     if (n_occ <= 0) {
       std::cerr << "Error: occ_size must be greater than zero.\n";
       return 1;
     }
-    if (nblk_occ <= 0) {
-      std::cerr << "Error: occ_nblocks must be greater than zero.\n";
+    if (b_occ <= 0) {
+      std::cerr << "Error: occ_tilesize must be greater than zero.\n";
       return 1;
     }
     if (n_uocc <= 0) {
       std::cerr << "Error: uocc_size must be greater than zero.\n";
       return 1;
     }
-    if (nblk_uocc <= 0) {
-      std::cerr << "Error: uocc_nblocks must be greater than zero.\n";
-      return 1;
-    }
-    if ((n_occ < nblk_occ) != 0ul) {
-      std::cerr << "Error: occ_size must be greater than occ_nblocks.\n";
-      return 1;
-    }
-    if ((n_uocc < nblk_uocc) != 0ul) {
-      std::cerr << "Error: uocc_size must be greater than uocc_nblocks.\n";
+    if (b_uocc <= 0) {
+      std::cerr << "Error: uocc_tilesize must be greater than zero.\n";
       return 1;
     }
     const long repeat = (argc >= 6 ? atol(argv[5]) : 5);
     if (repeat <= 0) {
       std::cerr << "Error: number of repetitions must be greater than zero.\n";
       return 1;
     }
-    const bool use_complex = (argc >= 7 ? to_bool(argv[6]) : false);
+
+    const std::string scalar_type_str = (argc >= 7 ? argv[6] : "double");
+    if (scalar_type_str != "double" && scalar_type_str != "float" &&
+        scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
+      std::cerr << "Error: invalid real type " << scalar_type_str << ".\n";
+      std::cerr << "       valid real types are \"double\", \"float\", "
+                   "\"zdouble\", and \"zfloat\".\n";
+      return 1;
+    }
+
+    const bool uniform_tiling = (argc >= 8 ? std::atol(argv[7]) : true);
 
     if (world.rank() == 0)
       std::cout << "TiledArray: CC T2.V term test..."
                 << "\nGit description: " << TiledArray::git_description()
                 << "\nNumber of nodes     = " << world.size()
                 << "\nocc size            = " << n_occ
-                << "\nocc nblocks         = " << nblk_occ
+                << "\nocc tilesize        = " << b_occ
                 << "\nuocc size           = " << n_uocc
-                << "\nuocc nblocks        = " << nblk_uocc
-                << "\nComplex             = "
-                << (use_complex ? "true" : "false") << "\n";
+                << "\nuocc tilesize       = " << b_uocc
+                << "\nscalar type         = " << scalar_type_str
+                << "\nuniform tiling      = "
+                << (uniform_tiling ? "true" : "false") << std::endl;
 
     // Construct TiledRange1's
-    std::vector<unsigned int> tiling_occ = make_tiling(n_occ, nblk_occ);
-    std::vector<unsigned int> tiling_uocc = make_tiling(n_uocc, nblk_uocc);
+    std::vector<unsigned int> tiling_occ =
+        uniform_tiling ? make_uniform_tiling(n_occ, b_occ)
+                       : make_nonuniform_tiling(n_occ, b_occ);
+    std::vector<unsigned int> tiling_uocc =
+        uniform_tiling ? make_uniform_tiling(n_uocc, b_uocc)
+                       : make_nonuniform_tiling(n_uocc, b_uocc);
     auto trange_occ = TA::TiledRange1(tiling_occ.begin(), tiling_occ.end());
     auto trange_uocc = TA::TiledRange1(tiling_uocc.begin(), tiling_uocc.end());
-
-    if (use_complex)
-      cc_abcd<std::complex<double>>(world, trange_occ, trange_uocc, repeat);
-    else
+    auto print_tile_sizes = [](const auto& tiling) {
+      auto b = tiling.begin();
+      for (auto current = b + 1; current != tiling.end(); ++current) {
+        std::cout << *current - *(current - 1) << " ";
+      }
+      std::cout << std::endl;
+    };
+    std::cout << " occ tile sizes: ";
+    print_tile_sizes(tiling_occ);
+    std::cout << "uocc tile sizes: ";
+    print_tile_sizes(tiling_uocc);
+
+    if (scalar_type_str == "double")
       cc_abcd<double>(world, trange_occ, trange_uocc, repeat);
+    else if (scalar_type_str == "zdouble")
+      cc_abcd<std::complex<double>>(world, trange_occ, trange_uocc, repeat);
+    else if (scalar_type_str == "float")
+      cc_abcd<float>(world, trange_occ, trange_uocc, repeat);
+    else if (scalar_type_str == "zfloat")
+      cc_abcd<std::complex<float>>(world, trange_occ, trange_uocc, repeat);
 
     TA::finalize();
 
@@ -175,13 +211,13 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
   const double flops_per_fma =
       (complex_T ? 8 : 2);  // 1 multiply takes 6/1 flops for complex/real
                             // 1 add takes 2/1 flops for complex/real
-  const double gflops_per_call = flops_per_fma * std::pow(n_occ, 2) *
-                                 std::pow(n_uocc, 4) / std::pow(1024., 3);
+  const double gflops_per_call =
+      flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9;
 
   // Construct tensors
-  TA::TArrayD t2(world, trange_oovv);
-  TA::TArrayD v(world, trange_vvvv);
-  TA::TArrayD t2_v;
+  TA::TSpArray<T> t2(world, trange_oovv);
+  TA::TSpArray<T> v(world, trange_vvvv);
+  TA::TSpArray<T> t2_v;
   // To validate, fill input tensors with random data, otherwise just with 1s
   if (do_validate) {
     rand_fill_array(t2);
@@ -201,25 +237,26 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
   for (int i = 0; i < repeat; ++i) {
     auto tp_start = TiledArray::now();
     // this is how the user would express this contraction
-    if (false) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d");
+    if (true) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d");
 
     // this demonstrates to the PaRSEC team what happens under the hood of the
     // expression above
-    if (true) {
+    if (false) {
       tensor_contract_444(t2_v, t2, v);
 
       // to validate replace: false -> true
       if (do_validate) {
         // obtain reference result using the high-level DSL
-        TA::TArrayD t2_v_ref;
+        TA::TSpArray<T> t2_v_ref;
         t2_v_ref("i,j,a,b") = t2("i,j,c,d") * v("c,d,a,b");
-        TA::TArrayD error;
+        TA::TSpArray<T> error;
         error("i,j,a,b") = t2_v_ref("i,j,a,b") - t2_v("i,j,a,b");
         std::cout << "Validating the result (ignore the timings/performance!): "
                      "||ref_result - result||_2^2 = "
                   << error("i,j,a,b").squared_norm().get() << std::endl;
       }
     }
+    t2_v.world().gop.fence();
     TiledArray::record_duration_since(tp_start);
 
     const double time = TiledArray::durations().back();
@@ -357,6 +394,7 @@ template <typename Tile, typename Policy>
 void tensor_contract_444(TA::DistArray<Tile, Policy>& tv,
                          const TA::DistArray<Tile, Policy>& t,
                          const TA::DistArray<Tile, Policy>& v) {
+  using Shape = typename Policy::shape_type;
   // for convenience, obtain the tiled ranges for the two kinds of dimensions
   // used to define t, v, and tv
   auto trange_occ = t.trange().dim(0);   // the first dimension of t is occ
@@ -378,10 +416,10 @@ void tensor_contract_444(TA::DistArray<Tile, Policy>& tv,
   auto ncols = n_uocc * n_uocc;
   TA::detail::ProcGrid proc_grid(world, nrowtiles, ncoltiles, nrows, ncols);
   std::shared_ptr<TA::Pmap> pmap;
-  auto t_eval = make_array_eval(t, t.world(), TA::DenseShape(),
+  auto t_eval = make_array_eval(t, t.world(), Shape(),
                                 proc_grid.make_row_phase_pmap(ninttiles),
                                 TA::Permutation(), make_array_noop<Tile>());
-  auto v_eval = make_array_eval(v, v.world(), TA::DenseShape(),
+  auto v_eval = make_array_eval(v, v.world(), Shape(),
                                 proc_grid.make_col_phase_pmap(ninttiles),
                                 TA::Permutation(), make_array_noop<Tile>());
 
@@ -400,7 +438,7 @@ void tensor_contract_444(TA::DistArray<Tile, Policy>& tv,
   // 2. there will be a dummy output ArrayEval, its Futures will be set by the
   // PTG
   auto contract =
-      make_contract_eval(t_eval, v_eval, world, TA::DenseShape(), pmap,
+      make_contract_eval(t_eval, v_eval, world, Shape(), pmap,
                          TA::Permutation(), make_contract<Tile>(4u, 4u, 4u));
 
   // eval() just schedules the Summa task and proceeds

diff --git a/examples/gemm/ta_dense_asymm.cpp b/examples/gemm/ta_dense_asymm.cpp
@@ -50,11 +50,6 @@ int main(int argc, char** argv) {
     std::cerr << "Error: block sizes must be greater than zero.\n";
     return 1;
   }
-  if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) {
-    std::cerr
-        << "Error: dimension size must be evenly divisible by block size.\n";
-    return 1;
-  }
   const long repeat = (argc >= 8 ? atol(argv[7]) : 5);
   if (repeat <= 0) {
     std::cerr << "Error: number of repetitions must be greater than zero.\n";
@@ -72,22 +67,22 @@ int main(int argc, char** argv) {
 
   const bool do_memtrace = (argc >= 10 ? std::atol(argv[9]) : false);
 
-  const std::size_t Tm = Nm / Bm;
-  const std::size_t Tn = Nn / Bn;
-  const std::size_t Tk = Nk / Bk;
-
   // Construct TiledRange
   std::vector<unsigned int> blocking_m;
-  blocking_m.reserve(Tm + 1);
   for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i);
+  if (blocking_m.back() != Nm) blocking_m.push_back(Nm);
 
   std::vector<unsigned int> blocking_n;
-  blocking_n.reserve(Tn + 1);
   for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i);
+  if (blocking_n.back() != Nn) blocking_n.push_back(Nn);
 
   std::vector<unsigned int> blocking_k;
-  blocking_k.reserve(Tk + 1);
   for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i);
+  if (blocking_k.back() != Nk) blocking_k.push_back(Nk);
+
+  const std::size_t Tm = blocking_m.size() - 1;
+  const std::size_t Tn = blocking_n.size() - 1;
+  const std::size_t Tk = blocking_k.size() - 1;
 
   // Structure of c
   std::vector<TiledArray::TiledRange1> blocking_C;
@@ -138,13 +133,13 @@ int main(int argc, char** argv) {
                 << "\nScalar type       = " << scalar_type_str
                 << "\nSize of A         = " << Nm << "x" << Nk << " ("
                 << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
-                << "\nSize of A block   = " << Bm << "x" << Bk
+                << "\nSize of (largest) A block   = " << Bm << "x" << Bk
                 << "\nSize of B         = " << Nk << "x" << Nn << " ("
                 << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
-                << "\nSize of B block   = " << Bk << "x" << Bn
+                << "\nSize of (largest) B block   = " << Bk << "x" << Bn
                 << "\nSize of C         = " << Nm << "x" << Nn << " ("
                 << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
-                << "\nSize of C block   = " << Bm << "x" << Bn
+                << "\nSize of (largest) C block   = " << Bm << "x" << Bn
                 << "\n# of blocks of C  = " << Tm * Tn
                 << "\nAverage # of blocks of C/node = "
                 << double(Tm * Tn) / double(world.size()) << "\n";
@@ -153,10 +148,11 @@ int main(int argc, char** argv) {
       if (do_memtrace) {
         world.gop.fence();
         madness::print_meminfo(world.rank(), str);
+      } else {
+        world.gop.fence();
       }
 #ifdef TA_TENSOR_MEM_PROFILE
       {
-        world.gop.fence();
         std::cout
             << str << ": TA::Tensor allocated "
             << TA::hostEnv::instance()->host_allocator_getActualHighWatermark()