complex ta dense asymm #397

Merged 2 commits (Mar 21, 2023)
133 changes: 99 additions & 34 deletions examples/dgemm/ta_dense_asymm.cpp
@@ -31,7 +31,8 @@ int main(int argc, char** argv) {
"blocked by Bm, Bn, and Bk, respectively"
<< std::endl
<< "Usage: " << argv[0]
<< " Nm Bm Nn Bn Nk Bk [repetitions=5] [real=double]\n";
<< " Nm Bm Nn Bn Nk Bk [repetitions=5] [scalar=double] "
"[do_memtrace=0]\n";
return 0;
}
const long Nm = atol(argv[1]);
@@ -59,19 +60,25 @@ int main(int argc, char** argv) {
return 1;
}

const std::string real_type_str = (argc >= 9 ? argv[8] : "double");
if (real_type_str != "double" && real_type_str != "float") {
std::cerr << "Error: invalid real type " << real_type_str << ".\n";
const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double");
if (scalar_type_str != "double" && scalar_type_str != "float" &&
scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
std::cerr << "Error: invalid real type " << scalar_type_str << ".\n";
std::cerr << " valid real types are \"double\", \"float\", "
"\"zdouble\", and \"zfloat\".\n";
return 1;
}

const bool do_memtrace = (argc >= 10 ? std::atol(argv[9]) : false);

const std::size_t Tm = Nm / Bm;
const std::size_t Tn = Nn / Bn;
const std::size_t Tk = Nk / Bk;

if (world.rank() == 0)
std::cout << "TiledArray: dense matrix multiply test...\n"
<< "Number of nodes = " << world.size()
<< "\nScalar type = " << scalar_type_str
<< "\nSize of A = " << Nm << "x" << Nk << " ("
<< double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)"
<< "\nSize of A block = " << Bm << "x" << Bk
Expand Down Expand Up @@ -133,54 +140,112 @@ int main(int argc, char** argv) {

auto run = [&](auto* tarray_ptr) {
using Array = std::decay_t<std::remove_pointer_t<decltype(tarray_ptr)>>;

// Construct and initialize arrays
Array a(world, trange_a);
Array b(world, trange_b);
Array c(world, trange_c);
a.fill(1.0);
b.fill(1.0);

// Start clock
world.gop.fence();
const double wall_time_start = madness::wall_time();

// Do matrix multiplication
for (int i = 0; i < repeat; ++i) {
c("m,n") = a("m,k") * b("k,n");
using scalar_type = TiledArray::detail::scalar_t<Array>;

const auto complex_T = TiledArray::detail::is_complex_v<scalar_type>;
const std::int64_t nflops =
(complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
* static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
static_cast<std::int64_t>(Nk);

auto memtrace = [do_memtrace, &world](const std::string& str) -> void {
if (do_memtrace) {
world.gop.fence();
madness::print_meminfo(world.rank(), str);
}
#ifdef TA_TENSOR_MEM_PROFILE
{
world.gop.fence();
std::cout
<< str << ": TA::Tensor allocated "
<< TA::hostEnv::instance()->host_allocator_getActualHighWatermark()
<< " bytes and used "
<< TA::hostEnv::instance()->host_allocator().getHighWatermark()
<< " bytes" << std::endl;
}
#endif
};

memtrace("start");
{ // array lifetime scope
// Construct and initialize arrays
Array a(world, trange_a);
Array b(world, trange_b);
Array c(world, trange_c);
a.fill(1.0);
b.fill(1.0);
memtrace("allocated a and b");

// Start clock
world.gop.fence();
if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n";
}

// Stop clock
const double wall_time_stop = madness::wall_time();

if (world.rank() == 0)
std::cout << "Average wall time = "
<< (wall_time_stop - wall_time_start) / double(repeat)
<< " sec\nAverage GFLOPS = "
<< double(repeat) * 2.0 * double(Nn * Nm * Nk) /
(wall_time_stop - wall_time_start) / 1.0e9
<< "\n";
if (world.rank() == 0)
std::cout << "Starting iterations: "
<< "\n";

double total_time = 0.0;
double total_gflop_rate = 0.0;

// Do matrix multiplication
for (int i = 0; i < repeat; ++i) {
const double start = madness::wall_time();
c("m,n") = a("m,k") * b("k,n");
memtrace("c=a*b");
const double time = madness::wall_time() - start;
total_time += time;
const double gflop_rate = double(nflops) / (time * 1.e9);
total_gflop_rate += gflop_rate;
if (world.rank() == 0)
std::cout << "Iteration " << i + 1 << " time=" << time
<< " GFLOPS=" << gflop_rate << "\n";
}

// Stop clock
const double wall_time_stop = madness::wall_time();

if (world.rank() == 0) {
std::cout << "Average wall time = " << total_time / double(repeat)
<< " sec\nAverage GFLOPS = "
<< total_gflop_rate / double(repeat) << "\n";
}

} // array lifetime scope
memtrace("stop");
};

// by default use TiledArray tensors
constexpr bool use_btas = false;
// btas::Tensor instead
if (real_type_str == "double") {
if (scalar_type_str == "double") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayD*>(nullptr));
else
run(static_cast<TiledArray::DistArray<
TiledArray::Tile<btas::Tensor<double, TiledArray::Range>>>*>(
nullptr));
} else {
} else if (scalar_type_str == "float") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayF*>(nullptr));
else
run(static_cast<TiledArray::DistArray<
TiledArray::Tile<btas::Tensor<float, TiledArray::Range>>>*>(
nullptr));
} else if (scalar_type_str == "zdouble") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayZ*>(nullptr));
else
run(static_cast<TiledArray::DistArray<TiledArray::Tile<
btas::Tensor<std::complex<double>, TiledArray::Range>>>*>(
nullptr));
} else if (scalar_type_str == "zfloat") {
if constexpr (!use_btas)
run(static_cast<TiledArray::TArrayC*>(nullptr));
else
run(static_cast<TiledArray::DistArray<TiledArray::Tile<
btas::Tensor<std::complex<float>, TiledArray::Range>>>*>(
nullptr));
} else {
abort(); // unreachable
}

return 0;
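The example's new GFLOPS accounting rests on one convention, stated in the diff's comment: a complex multiply costs 6 real flops and a complex add costs 2, so each multiply-add pair in the contraction costs 8 flops versus 2 for real scalars. A minimal sketch of that count, factored into a standalone helper (the function name and framing are illustrative, not part of this PR):

#include <cstdint>

// Flops for C("m,n") = A("m,k") * B("k,n"): one multiply-add per (m,n,k)
// triple. Complex: 6 flops per multiply + 2 per add = 8; real: 1 + 1 = 2.
std::int64_t gemm_flops(std::int64_t Nm, std::int64_t Nn, std::int64_t Nk,
                        bool complex_scalars) {
  return (complex_scalars ? 8 : 2) * Nm * Nn * Nk;
}

For Nm = Nn = Nk = 1024, this gives about 2.15 GFLOP per real iteration and exactly four times that for the complex variant, which is why the patched loop computes a per-iteration rate from nflops instead of reusing the old hard-coded factor of 2.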
43 changes: 22 additions & 21 deletions src/TiledArray/tensor/kernels.h
@@ -646,34 +646,34 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1,
/// \tparam ReduceOp The element-wise reduction
/// operation type
/// \tparam JoinOp The result operation type
/// \tparam Scalar A
/// scalar type
/// \tparam Identity A type that can be used as an argument to ReduceOp
/// \tparam T1 The first argument tensor type
/// \tparam Ts The
/// argument tensor types
/// \tparam Ts The argument tensor types
/// \param reduce_op The element-wise reduction operation
/// \param identity The initial value for the reduction and the result
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <
typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if_t<
is_tensor<T1, Ts...>::value && is_contiguous_tensor<T1, Ts...>::value &&
!is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Scalar>,
!is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Identity>,
std::decay_t<T1>, std::decay_t<Ts>...>>* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity,
const T1& tensor1, const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

const auto volume = tensor1.range().volume();

math::reduce_op(reduce_op, join_op, identity, volume, identity,
auto init = std::forward<Identity>(identity);
math::reduce_op(std::forward<ReduceOp>(reduce_op),
std::forward<JoinOp>(join_op), init, volume, init,
tensor1.data(), tensors.data()...);

return identity;
return init;
}

/// Reduction operation for tensors
@@ -698,8 +698,8 @@ template <
is_tensor<T1, Ts...>::value && is_contiguous_tensor<T1, Ts...>::value &&
is_reduce_op_v<std::decay_t<ReduceOp>, std::decay_t<Scalar>,
std::decay_t<T1>, std::decay_t<Ts>...>>* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
reduce_op(identity, &tensor1, &tensors...);
return identity;
}
@@ -723,13 +723,14 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
template <typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if<
is_tensor_of_tensor<T1, Ts...>::value &&
is_contiguous_tensor<T1, Ts...>::value>::type* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
const T1& tensor1, const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Identity& identity, const T1& tensor1,
const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

@@ -765,24 +766,24 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity,
/// \param tensor1 The first tensor to be reduced
/// \param tensors The other tensors to be reduced
/// \return The reduced value of the tensor(s)
template <typename ReduceOp, typename JoinOp, typename Scalar, typename T1,
template <typename ReduceOp, typename JoinOp, typename Identity, typename T1,
typename... Ts,
typename std::enable_if<
is_tensor<T1, Ts...>::value &&
!is_contiguous_tensor<T1, Ts...>::value>::type* = nullptr>
Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Scalar identity, const T1& tensor1,
const Ts&... tensors) {
auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
const Identity& identity, const T1& tensor1,
const Ts&... tensors) {
TA_ASSERT(!empty(tensor1, tensors...));
TA_ASSERT(is_range_set_congruent(tensor1, tensors...));

const auto stride = inner_size(tensor1, tensors...);
const auto volume = tensor1.range().volume();

Scalar result = identity;
auto result = identity;
for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume;
ord += stride) {
Scalar temp = identity;
auto temp = identity;
math::reduce_op(reduce_op, join_op, identity, stride, temp,
tensor1.data() + tensor1.range().ordinal(ord),
(tensors.data() + tensors.range().ordinal(ord))...);
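The kernels.h changes all make the same move: the pinned Scalar template parameter becomes a deduced Identity, the identity argument is forwarded rather than copied, and the return type becomes auto, so the accumulator's type follows the identity instead of being forced to a fixed scalar. A self-contained sketch of that pattern over a flat buffer (names and framing are illustrative, not the TiledArray kernel itself):

#include <algorithm>
#include <cmath>
#include <utility>
#include <vector>

// The accumulator's type is deduced from `identity`, mirroring the
// Scalar -> Identity generalization in this patch.
template <typename ReduceOp, typename Identity, typename T>
auto flat_reduce(ReduceOp&& reduce_op, Identity&& identity,
                 const std::vector<T>& data) {
  auto result = std::forward<Identity>(identity);  // type tracks Identity
  for (const T& v : data) reduce_op(result, v);
  return result;  // deduced return type, not a pinned Scalar
}

// Usage: max-|x| over doubles, with identity 0.0.
// double m = flat_reduce(
//     [](double& acc, double v) { acc = std::max(acc, std::abs(v)); },
//     0.0, std::vector<double>{1.0, -3.0});  // m == 3.0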
42 changes: 22 additions & 20 deletions src/TiledArray/tensor/tensor.h
@@ -2174,18 +2174,19 @@ class Tensor {
/// identity . If HAVE_INTEL_TBB is defined, and this is a contiguous tensor,
/// the reduction will be executed in an undefined order, otherwise will
/// execute in the order of increasing \c i .
/// \tparam ReduceOp The reduction
/// operation type
/// \tparam ReduceOp The reduction operation type
/// \tparam JoinOp The join operation type
/// \param reduce_op The
/// element-wise reduction operation
/// \tparam T a type that can be used as argument to ReduceOp
/// \param reduce_op The element-wise reduction operation
/// \param join_op The join result operation
/// \param identity The identity value of the reduction
/// \return The reduced value
template <typename ReduceOp, typename JoinOp, typename Scalar>
decltype(auto) reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
Scalar identity) const {
return detail::tensor_reduce(reduce_op, join_op, identity, *this);
template <typename ReduceOp, typename JoinOp, typename Identity>
auto reduce(ReduceOp&& reduce_op, JoinOp&& join_op,
Identity&& identity) const {
return detail::tensor_reduce(std::forward<ReduceOp>(reduce_op),
std::forward<JoinOp>(join_op),
std::forward<Identity>(identity), *this);
}

/// Binary reduction operation
@@ -2196,22 +2197,23 @@
/// \c identity . If HAVE_INTEL_TBB is defined, and this is a contiguous
/// tensor, the reduction will be executed in an undefined order, otherwise
/// will execute in the order of increasing \c i .
/// \tparam Right The
/// right-hand argument tensor type
/// \tparam ReduceOp The reduction operation
/// type
/// \tparam Right The right-hand argument tensor type
/// \tparam ReduceOp The reduction operation type
/// \tparam JoinOp The join operation type
/// \param other The right-hand
/// argument of the binary reduction
/// \param reduce_op The element-wise
/// reduction operation \param join_op The join result operation
/// \tparam Identity A type that can be used as argument to ReduceOp
/// \param other The right-hand argument of the binary reduction
/// \param reduce_op The element-wise reduction operation
/// \param join_op The join result operation
/// \param identity The identity value of the reduction
/// \return The reduced value
template <typename Right, typename ReduceOp, typename JoinOp, typename Scalar,
template <typename Right, typename ReduceOp, typename JoinOp,
typename Identity,
typename std::enable_if<is_tensor<Right>::value>::type* = nullptr>
decltype(auto) reduce(const Right& other, ReduceOp&& reduce_op,
JoinOp&& join_op, Scalar identity) const {
return detail::tensor_reduce(reduce_op, join_op, identity, *this, other);
auto reduce(const Right& other, ReduceOp&& reduce_op, JoinOp&& join_op,
Identity&& identity) const {
return detail::tensor_reduce(
std::forward<ReduceOp>(reduce_op), std::forward<JoinOp>(join_op),
std::forward<Identity>(identity), *this, other);
}

/// Sum of elements
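As a usage sketch of the updated unary reduce() (a hedged example: it assumes an initialized TiledArray runtime and the documented TA::TensorD range-plus-fill constructor), a squared Frobenius norm could be written as:

#include <tiledarray.h>

// Squared Frobenius norm via the forwarded-identity reduce().
double squared_norm_example() {
  TA::TensorD t(TA::Range{4, 4}, 1.0);  // 4x4 tensor, every element 1.0
  return t.reduce(
      [](double& acc, double v) { acc += v * v; },          // element-wise
      [](double& acc, double partial) { acc += partial; },  // join partials
      0.0);                                                 // identity
}  // 16.0 for this fill

Because both the kernels and this front end now forward the identity, the same call should also work when the identity is a heavier type than the tensor's scalar, without an extra copy per reduction.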