complex ta dense asymm #397

Merged 2 commits (Mar 21, 2023)
133 changes: 99 additions & 34 deletions examples/dgemm/ta_dense_asymm.cpp
@@ -31,7 +31,8 @@ int main(int argc, char** argv) {
"blocked by Bm, Bn, and Bk, respectively"
<< std::endl
<< "Usage: " << argv[0]
<< " Nm Bm Nn Bn Nk Bk [repetitions=5] [real=double]\n";
<< " Nm Bm Nn Bn Nk Bk [repetitions=5] [scalar=double] "
"[do_memtrace=0]\n";
return 0;
}
const long Nm = atol(argv[1]);
@@ -59,19 +60,25 @@ int main(int argc, char** argv) {
return 1;
}

const std::string real_type_str = (argc >= 9 ? argv[8] : "double");
if (real_type_str != "double" && real_type_str != "float") {
std::cerr << "Error: invalid real type " << real_type_str << ".\n";
const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double");
if (scalar_type_str != "double" && scalar_type_str != "float" &&
scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
std::cerr << "Error: invalid real type " << scalar_type_str << ".\n";
std::cerr << " valid real types are \"double\", \"float\", "
"\"zdouble\", and \"zfloat\".\n";
return 1;
}

const bool do_memtrace = (argc >= 10 ? std::atol(argv[9]) : false);

const std::size_t Tm = Nm / Bm;
const std::size_t Tn = Nn / Bn;
const std::size_t Tk = Nk / Bk;

if (world.rank() == 0)
std::cout << "TiledArray: dense matrix multiply test...\n"
<< "Number of nodes = " << world.size()
<< "\nScalar type = " << scalar_type_str
<< "\nSize of A = " << Nm << "x" << Nk << " ("
<< double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)"
<< "\nSize of A block = " << Bm << "x" << Bk
Expand Down Expand Up @@ -133,54 +140,112 @@ int main(int argc, char** argv) {

auto run = [&](auto* tarray_ptr) {
using Array = std::decay_t<std::remove_pointer_t<decltype(tarray_ptr)>>;

// Construct and initialize arrays
Array a(world, trange_a);
Array b(world, trange_b);
Array c(world, trange_c);
a.fill(1.0);
b.fill(1.0);

// Start clock
world.gop.fence();
const double wall_time_start = madness::wall_time();

// Do matrix multiplication
for (int i = 0; i < repeat; ++i) {
c("m,n") = a("m,k") * b("k,n");
using scalar_type = TiledArray::detail::scalar_t<Array>;

const auto complex_T = TiledArray::detail::is_complex_v<scalar_type>;
const std::int64_t nflops =
(complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
* static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
static_cast<std::int64_t>(Nk);

auto memtrace = [do_memtrace, &world](const std::string& str) -> void {
if (do_memtrace) {
world.gop.fence();
madness::print_meminfo(world.rank(), str);
}
#ifdef TA_TENSOR_MEM_PROFILE
{
world.gop.fence();
std::cout
<< str << ": TA::Tensor allocated "
<< TA::hostEnv::instance()->host_allocator_getActualHighWatermark()
<< " bytes and used "
<< TA::hostEnv::instance()->host_allocator().getHighWatermark()
<< " bytes" << std::endl;
}
#endif
};

memtrace("start");
{ // array lifetime scope
// Construct and initialize arrays
Array a(world, trange_a);
Array b(world, trange_b);
Array c(world, trange_c);
a.fill(1.0);
b.fill(1.0);
memtrace("allocated a and b");

// Start clock
world.gop.fence();
if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n";
}

// Stop clock
const double wall_time_stop = madness::wall_time();

if (world.rank() == 0)
std::cout << "Average wall time = "
<< (wall_time_stop - wall_time_start) / double(repeat)
<< " sec\nAverage GFLOPS = "
<< double(repeat) * 2.0 * double(Nn * Nm * Nk) /
(wall_time_stop - wall_time_start) / 1.0e9
<< "\n";
if (world.rank() == 0)
std::cout << "Starting iterations: "
<< "\n";

double total_time = 0.0;
double total_gflop_rate = 0.0;

// Do matrix multiplication
for (int i = 0; i < repeat; ++i) {
const double start = madness::wall_time();
c("m,n") = a("m,k") * b("k,n");
memtrace("c=a*b");
const double time = madness::wall_time() - start;
total_time += time;
const double gflop_rate = double(nflops) / (time * 1.e9);
total_gflop_rate += gflop_rate;
if (world.rank() == 0)
std::cout << "Iteration " << i + 1 << " time=" << time
<< " GFLOPS=" << gflop_rate << "\n";
}

// Stop clock
const double wall_time_stop = madness::wall_time();

if (world.rank() == 0) {
std::cout << "Average wall time = " << total_time / double(repeat)
<< " sec\nAverage GFLOPS = "
<< total_gflop_rate / double(repeat) << "\n";
}

} // array lifetime scope
memtrace("stop");
};

// by default use TiledArray tensors
constexpr bool use_btas = false;
// btas::Tensor instead
if (real_type_str == "double") {
if (scalar_type_str == "double") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayD*>(nullptr));
else
run(static_cast<TiledArray::DistArray<
TiledArray::Tile<btas::Tensor<double, TiledArray::Range>>>*>(
nullptr));
} else {
} else if (scalar_type_str == "float") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayF*>(nullptr));
else
run(static_cast<TiledArray::DistArray<
TiledArray::Tile<btas::Tensor<float, TiledArray::Range>>>*>(
nullptr));
} else if (scalar_type_str == "zdouble") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayZ*>(nullptr));
else
run(static_cast<TiledArray::DistArray<TiledArray::Tile<
btas::Tensor<std::complex<double>, TiledArray::Range>>>*>(
nullptr));
} else if (scalar_type_str == "zfloat") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayC*>(nullptr));
else
run(static_cast<TiledArray::DistArray<TiledArray::Tile<
btas::Tensor<std::complex<float>, TiledArray::Range>>>*>(
nullptr));
} else {
abort(); // unreachable
}

return 0;
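The example's new GFLOPS accounting rests on one convention, stated in the diff's comment: a complex multiply costs 6 real flops and a complex add costs 2, so each multiply-add pair in the contraction costs 8 flops versus 2 for real scalars. A minimal sketch of that count, factored into a standalone helper (the function name and framing are illustrative, not part of this PR):

#include <cstdint>

// Flops for C("m,n") = A("m,k") * B("k,n"): one multiply-add per (m,n,k)
// triple. Complex: 6 flops per multiply + 2 per add = 8; real: 1 + 1 = 2.
std::int64_t gemm_flops(std::int64_t Nm, std::int64_t Nn, std::int64_t Nk,
                        bool complex_scalars) {
  return (complex_scalars ? 8 : 2) * Nm * Nn * Nk;
}

For Nm = Nn = Nk = 1024, this gives about 2.15 GFLOP per real iteration and exactly four times that for the complex variant, which is why the patched loop computes a per-iteration rate from nflops instead of reusing the old hard-coded factor of 2.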
43 changes: 22 additions & 21 deletions src/TiledArray/tensor/kernels.h
@@ -646,34 +646,34 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1,
/// \tparam ReduceOp The element-wise reduction
/// operation type
/// \tparam JoinOp The result operation type
/// \tparam Scalar A
/// scalar type
/// \tparam Identity A type that can be used as an argument to ReduceOp
/// \tparam T1 The first argument tensor type
/// \tparam Ts The
/// argument tensor types
/// \tparam Ts The argument tensor types
/// \param reduce_op The element-wise reduction operation
/// \param identity The initial value for the reduction and the result
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <
typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if_t<
is_tensor<T1, Ts...>::value && is_contiguous_tensor<T1, Ts...>::value &&
!is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Scalar>,
!is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Identity>,
std::decay_t<T1>, std::decay_t<Ts>...>>* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity,
const T1& tensor1, const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

const auto volume = tensor1.range().volume();

math::reduce_op(reduce_op, join_op, identity, volume, identity,
auto init = std::forward<Identity>(identity);
math::reduce_op(std::forward<ReduceOp>(reduce_op),
std::forward<JoinOp>(join_op), init, volume, init,
tensor1.data(), tensors.data()...);

return identity;
return init;
}

/// Reduction operation for tensors
@@ -698,8 +698,8 @@ template <
is_tensor<T1, Ts...>::value && is_contiguous_tensor<T1, Ts...>::value &&
is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Scalar>,
std::decay_t<T1>, std::decay_t<Ts>...>>* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
reduce_op(identity, &tensor1, &tensors...);
return identity;
}
@@ -723,13 +723,14 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
template <typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if<
is_tensor_of_tensor<T1, Ts...>::value &&
is_contiguous_tensor<T1, Ts...>::value>::type* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Identity& identity, const T1& tensor1,
const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

@@ -765,24 +766,24 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
template <typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if<
is_tensor<T1, Ts...>::value &&
!is_contiguous_tensor<T1, Ts...>::value>::type* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Scalar identity, const T1& tensor1,
const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Identity& identity, const T1& tensor1,
const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

const auto stride = inner_size(tensor1, tensors...);
const auto volume = tensor1.range().volume();

Scalar result = identity;
auto result = identity;
for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume;
ord += stride) {
Scalar temp = identity;
auto temp = identity;
math::reduce_op(reduce_op, join_op, identity, stride, temp,
tensor1.data() + tensor1.range().ordinal(ord),
(tensors.data() + tensors.range().ordinal(ord))...);
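The kernels.h changes all make the same move: the pinned Scalar template parameter becomes a deduced Identity, the identity argument is forwarded rather than copied, and the return type becomes auto, so the accumulator's type follows the identity instead of being forced to a fixed scalar. A self-contained sketch of that pattern over a flat buffer (names and framing are illustrative, not the TiledArray kernel itself):

#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

// The accumulator's type is deduced from `identity`, mirroring the
// Scalar -> Identity generalization in this patch.
template <typename ReduceOp, typename Identity, typename T>
auto flat_reduce(ReduceOp&& reduce_op, Identity&& identity,
                 const std::vector<T>& data) {
  auto result = std::forward<Identity>(identity);  // type tracks Identity
  for (const T& v : data) reduce_op(result, v);
  return result;  // deduced return type, not a pinned Scalar
}

// Usage: max-|x| over doubles, with identity 0.0.
// double m = flat_reduce(
//     [](double& acc, double v) { acc = std::max(acc, std::abs(v)); },
//     0.0, std::vector<double>{1.0, -3.0});  // m == 3.0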
42 changes: 22 additions & 20 deletions src/TiledArray/tensor/tensor.h
@@ -2174,18 +2174,19 @@ class Tensor {
/// identity . If HAVE_INTEL_TBB is defined, and this is a contiguous tensor,
/// the reduction will be executed in an undefined order, otherwise will
/// execute in the order of increasing \c i .
/// \tparam ReduceOp The reduction
/// operation type
/// \tparam ReduceOp The reduction operation type
/// \tparam JoinOp The join operation type
/// \param reduce_op The
/// element-wise reduction operation
/// \tparam T a type that can be used as argument to ReduceOp
/// \param reduce_op The element-wise reduction operation
/// \param join_op The join result operation
/// \param identity The identity value of the reduction
/// \return The reduced value
template <typename ReduceOp, typename JoinOp, typename Scalar>
decltype(auto) reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
Scalar identity) const {
return detail::tensor_reduce(reduce_op, join_op, identity, *this);
template <typename ReduceOp, typename JoinOp, typename Identity>
auto reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
Identity&& identity) const {
return detail::tensor_reduce(std::forward<ReduceOp>(reduce_op),
std::forward<JoinOp>(join_op),
std::forward<Identity>(identity), *this);
}

/// Binary reduction operation
@@ -2196,22 +2197,23 @@
/// \c identity . If HAVE_INTEL_TBB is defined, and this is a contiguous
/// tensor, the reduction will be executed in an undefined order, otherwise
/// will execute in the order of increasing \c i .
/// \tparam Right The
/// right-hand argument tensor type
/// \tparam ReduceOp The reduction operation
/// type
/// \tparam Right The right-hand argument tensor type
/// \tparam ReduceOp The reduction operation type
/// \tparam JoinOp The join operation type
/// \param other The right-hand
/// argument of the binary reduction
/// \param reduce_op The element-wise
/// reduction operation \param join_op The join result operation
/// \tparam Identity A type that can be used as argument to ReduceOp
/// \param other The right-hand argument of the binary reduction
/// \param reduce_op The element-wise reduction operation
/// \param join_op The join result operation
/// \param identity The identity value of the reduction
/// \return The reduced value
template <typename Right, typename ReduceOp, typename JoinOp, typename Scalar,
template <typename Right, typename ReduceOp, typename JoinOp,
typename Identity,
typename std::enable_if<is_tensor<Right>::value>::type* = nullptr>
decltype(auto) reduce(const Right& other, ReduceOp&& reduce_op,
JoinOp&& join_op, Scalar identity) const {
return detail::tensor_reduce(reduce_op, join_op, identity, *this, other);
auto reduce(const Right& other, ReduceOp&& reduce_op, JoinOp&& join_op,
Identity&& identity) const {
return detail::tensor_reduce(
std::forward<ReduceOp>(reduce_op), std::forward<JoinOp>(join_op),
std::forward<Identity>(identity), *this, other);
}

/// Sum of elements
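As a usage sketch of the updated unary reduce() (a hedged example: it assumes an initialized TiledArray runtime and the documented TA::TensorD range-plus-fill constructor), a squared Frobenius norm could be written as:

#include <tiledarray.h>

// Squared Frobenius norm via the forwarded-identity reduce().
double squared_norm_example() {
  TA::TensorD t(TA::Range{4, 4}, 1.0);  // 4x4 tensor, every element 1.0
  return t.reduce(
      [](double& acc, double v) { acc += v * v; },          // element-wise
      [](double& acc, double partial) { acc += partial; },  // join partials
      0.0);                                                 // identity
}  // 16.0 for this fill

Because both the kernels and this front end now forward the identity, the same call should also work when the identity is a heavier type than the tensor's scalar, without an extra copy per reduction.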