
Commit

Merge pull request #397 from ValeevGroup/evaleev/feature/complex_ta_dense_asymm

complex ta dense asymm
evaleev authored Mar 21, 2023
2 parents aea54ca + b214439 commit a5a225b
Showing 4 changed files with 183 additions and 105 deletions.
133 changes: 99 additions & 34 deletions examples/dgemm/ta_dense_asymm.cpp
@@ -31,7 +31,8 @@ int main(int argc, char** argv) {
"blocked by Bm, Bn, and Bk, respectively"
<< std::endl
<< "Usage: " << argv[0]
<< " Nm Bm Nn Bn Nk Bk [repetitions=5] [real=double]\n";
<< " Nm Bm Nn Bn Nk Bk [repetitions=5] [scalar=double] "
"[do_memtrace=0]\n";
return 0;
}
const long Nm = atol(argv[1]);
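
The extended usage string adds a scalar-type selector and an optional memory-tracing flag. A hypothetical invocation (sizes are illustrative; zdouble selects std::complex<double> elements and the trailing 1 enables memory tracing):

    ./ta_dense_asymm 1024 128 1024 128 1024 128 5 zdouble 1
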
@@ -59,19 +60,25 @@ int main(int argc, char** argv) {
return 1;
}

const std::string real_type_str = (argc >= 9 ? argv[8] : "double");
if (real_type_str != "double" && real_type_str != "float") {
std::cerr << "Error: invalid real type " << real_type_str << ".\n";
const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double");
if (scalar_type_str != "double" && scalar_type_str != "float" &&
scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
std::cerr << "Error: invalid real type " << scalar_type_str << ".\n";
std::cerr << " valid real types are \"double\", \"float\", "
"\"zdouble\", and \"zfloat\".\n";
return 1;
}

const bool do_memtrace = (argc >= 10 ? std::atol(argv[9]) : false);

const std::size_t Tm = Nm / Bm;
const std::size_t Tn = Nn / Bn;
const std::size_t Tk = Nk / Bk;

if (world.rank() == 0)
std::cout << "TiledArray: dense matrix multiply test...\n"
<< "Number of nodes = " << world.size()
<< "\nScalar type = " << scalar_type_str
<< "\nSize of A = " << Nm << "x" << Nk << " ("
<< double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)"
<< "\nSize of A block = " << Bm << "x" << Bk
@@ -133,54 +140,112 @@ int main(int argc, char** argv) {

auto run = [&](auto* tarray_ptr) {
using Array = std::decay_t<std::remove_pointer_t<decltype(tarray_ptr)>>;

// Construct and initialize arrays
Array a(world, trange_a);
Array b(world, trange_b);
Array c(world, trange_c);
a.fill(1.0);
b.fill(1.0);

// Start clock
world.gop.fence();
const double wall_time_start = madness::wall_time();

// Do matrix multiplication
for (int i = 0; i < repeat; ++i) {
c("m,n") = a("m,k") * b("k,n");
using scalar_type = TiledArray::detail::scalar_t<Array>;

const auto complex_T = TiledArray::detail::is_complex_v<scalar_type>;
const std::int64_t nflops =
(complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
* static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
static_cast<std::int64_t>(Nk);

auto memtrace = [do_memtrace, &world](const std::string& str) -> void {
if (do_memtrace) {
world.gop.fence();
madness::print_meminfo(world.rank(), str);
}
#ifdef TA_TENSOR_MEM_PROFILE
{
world.gop.fence();
std::cout
<< str << ": TA::Tensor allocated "
<< TA::hostEnv::instance()->host_allocator_getActualHighWatermark()
<< " bytes and used "
<< TA::hostEnv::instance()->host_allocator().getHighWatermark()
<< " bytes" << std::endl;
}
#endif
};

memtrace("start");
{ // array lifetime scope
// Construct and initialize arrays
Array a(world, trange_a);
Array b(world, trange_b);
Array c(world, trange_c);
a.fill(1.0);
b.fill(1.0);
memtrace("allocated a and b");

// Start clock
world.gop.fence();
if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n";
}

// Stop clock
const double wall_time_stop = madness::wall_time();

if (world.rank() == 0)
std::cout << "Average wall time = "
<< (wall_time_stop - wall_time_start) / double(repeat)
<< " sec\nAverage GFLOPS = "
<< double(repeat) * 2.0 * double(Nn * Nm * Nk) /
(wall_time_stop - wall_time_start) / 1.0e9
<< "\n";
if (world.rank() == 0)
std::cout << "Starting iterations: "
<< "\n";

double total_time = 0.0;
double total_gflop_rate = 0.0;

// Do matrix multiplication
for (int i = 0; i < repeat; ++i) {
const double start = madness::wall_time();
c("m,n") = a("m,k") * b("k,n");
memtrace("c=a*b");
const double time = madness::wall_time() - start;
total_time += time;
const double gflop_rate = double(nflops) / (time * 1.e9);
total_gflop_rate += gflop_rate;
if (world.rank() == 0)
std::cout << "Iteration " << i + 1 << " time=" << time
<< " GFLOPS=" << gflop_rate << "\n";
}

// Stop clock
const double wall_time_stop = madness::wall_time();

if (world.rank() == 0) {
std::cout << "Average wall time = " << total_time / double(repeat)
<< " sec\nAverage GFLOPS = "
<< total_gflop_rate / double(repeat) << "\n";
}

} // array lifetime scope
memtrace("stop");
};
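
The (complex_T ? 8 : 2) factor above encodes the usual flop-counting convention: a complex multiply costs 6 real flops (4 multiplies + 2 adds) and the accompanying accumulate another 2, versus 1 + 1 for real scalars. A minimal standalone sketch of that arithmetic (illustrative only, not part of the commit):

    #include <cstdint>
    #include <iostream>

    // Flop count for C += A*B, where A is Nm x Nk and B is Nk x Nn.
    std::int64_t gemm_nflops(std::int64_t Nm, std::int64_t Nn, std::int64_t Nk,
                             bool complex_scalars) {
      // complex: 6 flops per multiply + 2 per add; real: 1 + 1
      return (complex_scalars ? 8 : 2) * Nm * Nn * Nk;
    }

    int main() {
      std::cout << gemm_nflops(1024, 1024, 1024, false) << "\n";  // 2147483648
      std::cout << gemm_nflops(1024, 1024, 1024, true) << "\n";   // 8589934592
    }
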

// by default use TiledArray tensors; set use_btas to true to use
// btas::Tensor instead
constexpr bool use_btas = false;
if (real_type_str == "double") {
if (scalar_type_str == "double") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayD*>(nullptr));
else
run(static_cast<TiledArray::DistArray<
TiledArray::Tile<btas::Tensor<double, TiledArray::Range>>>*>(
nullptr));
} else {
} else if (scalar_type_str == "float") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayF*>(nullptr));
else
run(static_cast<TiledArray::DistArray<
TiledArray::Tile<btas::Tensor<float, TiledArray::Range>>>*>(
nullptr));
} else if (scalar_type_str == "zdouble") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayZ*>(nullptr));
else
run(static_cast<TiledArray::DistArray<TiledArray::Tile<
btas::Tensor<std::complex<double>, TiledArray::Range>>>*>(
nullptr));
} else if (scalar_type_str == "zfloat") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayC*>(nullptr));
else
run(static_cast<TiledArray::DistArray<TiledArray::Tile<
btas::Tensor<std::complex<float>, TiledArray::Range>>>*>(
nullptr));
} else {
abort(); // unreachable
}
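
Note that run never dereferences its argument: the static_cast<...*>(nullptr) calls pass typed null pointers that act purely as tags, and the generic lambda recovers the array type from the pointee. A minimal sketch of the idiom with stand-in types (hypothetical, for illustration):

    #include <iostream>
    #include <type_traits>

    int main() {
      // generic lambda: the tag's pointee type drives which instantiation compiles
      auto run = [](auto* tag) {
        using T = std::decay_t<std::remove_pointer_t<decltype(tag)>>;
        std::cout << (std::is_same_v<T, double> ? "double" : "float") << "\n";
      };
      run(static_cast<double*>(nullptr));  // prints "double"
      run(static_cast<float*>(nullptr));   // prints "float"
    }
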

return 0;
43 changes: 22 additions & 21 deletions src/TiledArray/tensor/kernels.h
@@ -646,34 +646,34 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1,
/// \tparam ReduceOp The element-wise reduction
/// operation type
/// \tparam JoinOp The result operation type
/// \tparam Scalar A
/// scalar type
/// \tparam Identity A type that can be used as an argument to ReduceOp
/// \tparam T1 The first argument tensor type
/// \tparam Ts The
/// argument tensor types
/// \tparam Ts The argument tensor types
/// \param reduce_op The element-wise reduction operation
/// \param identity The initial value for the reduction and the result
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <
typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if_t<
is_tensor<T1, Ts...>::value && is_contiguous_tensor<T1, Ts...>::value &&
!is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Scalar>,
!is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Identity>,
std::decay_t<T1>, std::decay_t<Ts>...>>* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity,
const T1& tensor1, const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

const auto volume = tensor1.range().volume();

math::reduce_op(reduce_op, join_op, identity, volume, identity,
auto init = std::forward<Identity>(identity);
math::reduce_op(std::forward<ReduceOp>(reduce_op),
std::forward<JoinOp>(join_op), init, volume, init,
tensor1.data(), tensors.data()...);

return identity;
return init;
}

/// Reduction operation for tensors
@@ -698,8 +698,8 @@ template <
is_tensor<T1, Ts...>::value && is_contiguous_tensor<T1, Ts...>::value &&
is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Scalar>,
std::decay_t<T1>, std::decay_t<Ts>...>>* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
reduce_op(identity, &tensor1, &tensors...);
return identity;
}
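
These hunks replace the fixed Scalar identity parameter with a deduced Identity and an auto return type, so the accumulator's type follows the identity argument instead of being pinned to a scalar. A standalone analogue of the pattern (hypothetical; not the TiledArray kernel):

    #include <cstddef>
    #include <vector>

    template <typename ReduceOp, typename JoinOp, typename Identity, typename T>
    auto reduce_range(ReduceOp&& reduce_op, JoinOp&& join_op,
                      const Identity& identity, const std::vector<T>& data) {
      // reduce two halves independently, then join the partial results
      auto left = identity;
      auto right = identity;
      const std::size_t half = data.size() / 2;
      for (std::size_t i = 0; i < half; ++i) reduce_op(left, data[i]);
      for (std::size_t i = half; i < data.size(); ++i) reduce_op(right, data[i]);
      join_op(left, right);
      return left;  // type deduced from the identity, not hard-coded
    }

    int main() {
      std::vector<int> v{1, 2, 3, 4};
      auto sum = reduce_range([](long& a, int x) { a += x; },
                              [](long& a, long p) { a += p; },
                              0L, v);
      return sum == 10L ? 0 : 1;  // accumulator type (long) came from the 0L identity
    }
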
@@ -723,13 +723,14 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
template <typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if<
is_tensor_of_tensor<T1, Ts...>::value &&
is_contiguous_tensor<T1, Ts...>::value>::type* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Identity& identity, const T1& tensor1,
const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

@@ -765,24 +766,24 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
template <typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if<
is_tensor<T1, Ts...>::value &&
!is_contiguous_tensor<T1, Ts...>::value>::type* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Scalar identity, const T1& tensor1,
const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Identity& identity, const T1& tensor1,
const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

const auto stride = inner_size(tensor1, tensors...);
const auto volume = tensor1.range().volume();

Scalar result = identity;
auto result = identity;
for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume;
ord += stride) {
Scalar temp = identity;
auto temp = identity;
math::reduce_op(reduce_op, join_op, identity, stride, temp,
tensor1.data() + tensor1.range().ordinal(ord),
(tensors.data() + tensors.range().ordinal(ord))...);
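
In the non-contiguous overload above, each iteration reduces one contiguous run of stride elements into a fresh temp seeded with the identity, then joins that partial into the running result. A simplified serial analogue (hypothetical; contiguous data for brevity):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Sum data in blocks of `stride` elements: each block gets its own
    // partial (temp), which is then joined into the running result.
    double blocked_sum(const std::vector<double>& data, std::size_t stride) {
      double result = 0.0;  // identity
      for (std::size_t ord = 0; ord < data.size(); ord += stride) {
        double temp = 0.0;  // per-block partial, seeded with the identity
        const std::size_t end = std::min(ord + stride, data.size());
        for (std::size_t i = ord; i < end; ++i) temp += data[i];
        result += temp;     // join
      }
      return result;
    }
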
42 changes: 22 additions & 20 deletions src/TiledArray/tensor/tensor.h
@@ -2174,18 +2174,19 @@ class Tensor {
/// identity . If HAVE_INTEL_TBB is defined, and this is a contiguous tensor,
/// the reduction will be executed in an undefined order, otherwise will
/// execute in the order of increasing \c i .
/// \tparam ReduceOp The reduction
/// operation type
/// \tparam ReduceOp The reduction operation type
/// \tparam JoinOp The join operation type
/// \param reduce_op The
/// element-wise reduction operation
/// \tparam T a type that can be used as argument to ReduceOp
/// \param reduce_op The element-wise reduction operation
/// \param join_op The join result operation
/// \param identity The identity value of the reduction
/// \return The reduced value
template <typename ReduceOp, typename JoinOp, typename Scalar>
decltype(auto) reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
Scalar identity) const {
return detail::tensor_reduce(reduce_op, join_op, identity, *this);
template <typename ReduceOp, typename JoinOp, typename Identity>
auto reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
Identity&& identity) const {
return detail::tensor_reduce(std::forward<ReduceOp>(reduce_op),
std::forward<JoinOp>(join_op),
std::forward<Identity>(identity), *this);
}

/// Binary reduction operation
@@ -2196,22 +2197,23 @@
/// \c identity . If HAVE_INTEL_TBB is defined, and this is a contiguous
/// tensor, the reduction will be executed in an undefined order, otherwise
/// will execute in the order of increasing \c i .
/// \tparam Right The
/// right-hand argument tensor type
/// \tparam ReduceOp The reduction operation
/// type
/// \tparam Right The right-hand argument tensor type
/// \tparam ReduceOp The reduction operation type
/// \tparam JoinOp The join operation type
/// \param other The right-hand
/// argument of the binary reduction
/// \param reduce_op The element-wise
/// reduction operation \param join_op The join result operation
/// \tparam Identity A type that can be used as argument to ReduceOp
/// \param other The right-hand argument of the binary reduction
/// \param reduce_op The element-wise reduction operation
/// \param join_op The join result operation
/// \param identity The identity value of the reduction
/// \return The reduced value
template <typename Right, typename ReduceOp, typename JoinOp, typename Scalar,
template <typename Right, typename ReduceOp, typename JoinOp,
typename Identity,
typename std::enable_if<is_tensor<Right>::value>::type* = nullptr>
decltype(auto) reduce(const Right& other, ReduceOp&& reduce_op,
JoinOp&& join_op, Scalar identity) const {
return detail::tensor_reduce(reduce_op, join_op, identity, *this, other);
auto reduce(const Right& other, ReduceOp&& reduce_op, JoinOp&& join_op,
Identity&& identity) const {
return detail::tensor_reduce(
std::forward<ReduceOp>(reduce_op), std::forward<JoinOp>(join_op),
std::forward<Identity>(identity), *this, other);
}

/// Sum of elements
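
With the forwarding overloads in place, reduce accepts functors and identities of any compatible type, and the result type is deduced from the identity. A hypothetical call against the documented (reduce_op, join_op, identity) signature, summing the elements of a small tensor (the Tensor and Range constructors used here are assumptions, not shown in this diff):

    // assumes a dense 4x4 TiledArray::Tensor<double> filled with ones
    TiledArray::Tensor<double> t(TiledArray::Range{4, 4}, 1.0);
    auto sum = t.reduce(
        [](double& acc, double v) { acc += v; },              // element-wise reduce
        [](double& acc, double partial) { acc += partial; },  // join partials
        0.0);                                                 // identity
    // sum == 16.0
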
(diff of the fourth changed file not shown)
