Skip to content

Commit

Permalink
Merge pull request #462 from ValeevGroup/evaleev/fixup/gemm-benchmark…
Browse files Browse the repository at this point in the history
…s-tiling

gemm examples support nonuniform tiling
  • Loading branch information
evaleev authored Jul 23, 2024
2 parents 253d59c + 9d5ad23 commit cc0cd50
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 83 deletions.
4 changes: 2 additions & 2 deletions examples/device/ta_cc_abcd_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
const double flops_per_fma =
(complex_T ? 8 : 2); // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
const double n_gflop = flops_per_fma * std::pow(n_occ, 2) *
std::pow(n_uocc, 4) / std::pow(1024., 3);
const double n_gflop =
flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9;

using deviceTile =
btas::Tensor<T, TA::Range, TiledArray::device_um_btas_varray<T>>;
Expand Down
15 changes: 3 additions & 12 deletions examples/device/ta_dense_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
using RT = TiledArray::detail::scalar_t<Storage>;
constexpr auto complex_T = TiledArray::detail::is_complex_v<T>;

const std::size_t Tm = Nm / Bm;
const std::size_t Tn = Nn / Bn;
const std::size_t Tk = Nk / Bk;

const std::int64_t nflops =
(complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
Expand All @@ -64,16 +60,16 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,

// Construct TiledRange
std::vector<unsigned int> blocking_m;
blocking_m.reserve(Tm + 1);
for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i);
const std::size_t Tm = blocking_m.size() - 1;

std::vector<unsigned int> blocking_n;
blocking_n.reserve(Tn + 1);
for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i);
const std::size_t Tn = blocking_n.size() - 1;

std::vector<unsigned int> blocking_k;
blocking_k.reserve(Tk + 1);
for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i);
const std::size_t Tk = blocking_k.size();

// Structure of c
std::vector<TiledArray::TiledRange1> blocking_C;
Expand Down Expand Up @@ -255,11 +251,6 @@ int try_main(int argc, char **argv) {
std::cerr << "Error: block sizes must be greater than zero.\n";
return 1;
}
if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) {
std::cerr
<< "Error: dimension size must be evenly divisible by block size.\n";
return 1;
}
const long nrepeat = (argc >= 8 ? atol(argv[7]) : 5);
if (nrepeat <= 0) {
std::cerr << "Error: number of repetitions must be greater than zero.\n";
Expand Down
144 changes: 91 additions & 53 deletions examples/gemm/ta_cc_abcd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,30 @@ bool to_bool(const char* str) {
// if n = average tile size
// this will produce tiles of these sizes: n+1, n-1, n+2, n-2, etc.
// the last tile absorbs the remainder
std::vector<unsigned int> make_tiling(unsigned int range_size,
unsigned int ntiles) {
const int average_tile_size = range_size / ntiles;
std::vector<unsigned int> result(ntiles + 1);
result[0] = 0;
for (long t = 0; t != ntiles - 1; ++t) {
result[t + 1] = result[t] + average_tile_size +
std::max(static_cast<int>((t % 2 == 0) ? (t + 1) : (-t)),
1 - average_tile_size);
std::vector<unsigned int> make_nonuniform_tiling(unsigned int range_size,
int tile_size) {
std::vector<unsigned int> result;
result.push_back(0);
for (long t = 0; true; ++t) {
unsigned int next_tile_boundary =
result.back() + tile_size +
std::max(static_cast<int>((t % 2 == 0) ? (t + 1) : (-t)),
1 - tile_size);
if (next_tile_boundary >= range_size) break;
result.push_back(next_tile_boundary);
}
result[ntiles] = range_size;
if (result.back() != range_size) result.push_back(range_size);
return result;
}

// makes tiles as uniform as possible
std::vector<unsigned int> make_uniform_tiling(unsigned int range_size,
int tile_size) {
std::vector<unsigned int> result;
for (unsigned int t = 0; t <= range_size; t += tile_size) {
result.push_back(t);
}
if (result.back() != range_size) result.push_back(range_size);
return result;
}

Expand Down Expand Up @@ -72,70 +85,93 @@ int main(int argc, char** argv) {

// Get command line arguments
if (argc < 5) {
std::cout << "Mocks t2(i,a,j,b) * v(a,b,c,d) term in CC amplitude eqs"
<< std::endl
<< "Usage: " << argv[0]
<< " occ_size occ_nblocks uocc_size "
"uocc_nblocks [repetitions] [use_complex]"
<< std::endl;
std::cout
<< "Mocks t2(i,j,a,b) * v(a,b,c,d) term in CC amplitude eqs"
<< std::endl
<< "Usage: " << argv[0]
<< " occ_size occ_tilesize uocc_size "
"uocc_tilesize [repetitions] [scalar=double] [uniform_tiling=1]"
<< std::endl;
return 0;
}
const long n_occ = atol(argv[1]);
const long nblk_occ = atol(argv[2]);
const long b_occ = atol(argv[2]);
const long n_uocc = atol(argv[3]);
const long nblk_uocc = atol(argv[4]);
const long b_uocc = atol(argv[4]);
if (n_occ <= 0) {
std::cerr << "Error: occ_size must be greater than zero.\n";
return 1;
}
if (nblk_occ <= 0) {
std::cerr << "Error: occ_nblocks must be greater than zero.\n";
if (b_occ <= 0) {
std::cerr << "Error: occ_tilesize must be greater than zero.\n";
return 1;
}
if (n_uocc <= 0) {
std::cerr << "Error: uocc_size must be greater than zero.\n";
return 1;
}
if (nblk_uocc <= 0) {
std::cerr << "Error: uocc_nblocks must be greater than zero.\n";
return 1;
}
if ((n_occ < nblk_occ) != 0ul) {
std::cerr << "Error: occ_size must be greater than occ_nblocks.\n";
return 1;
}
if ((n_uocc < nblk_uocc) != 0ul) {
std::cerr << "Error: uocc_size must be greater than uocc_nblocks.\n";
if (b_uocc <= 0) {
std::cerr << "Error: uocc_tilesize must be greater than zero.\n";
return 1;
}
const long repeat = (argc >= 6 ? atol(argv[5]) : 5);
if (repeat <= 0) {
std::cerr << "Error: number of repetitions must be greater than zero.\n";
return 1;
}
const bool use_complex = (argc >= 7 ? to_bool(argv[6]) : false);

const std::string scalar_type_str = (argc >= 7 ? argv[6] : "double");
if (scalar_type_str != "double" && scalar_type_str != "float" &&
scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
std::cerr << "Error: invalid real type " << scalar_type_str << ".\n";
std::cerr << " valid real types are \"double\", \"float\", "
"\"zdouble\", and \"zfloat\".\n";
return 1;
}

const bool uniform_tiling = (argc >= 8 ? std::atol(argv[7]) : true);

if (world.rank() == 0)
std::cout << "TiledArray: CC T2.V term test..."
<< "\nGit description: " << TiledArray::git_description()
<< "\nNumber of nodes = " << world.size()
<< "\nocc size = " << n_occ
<< "\nocc nblocks = " << nblk_occ
<< "\nocc tilesize = " << b_occ
<< "\nuocc size = " << n_uocc
<< "\nuocc nblocks = " << nblk_uocc
<< "\nComplex = "
<< (use_complex ? "true" : "false") << "\n";
<< "\nuocc tilesize = " << b_uocc
<< "\nscalar type = " << scalar_type_str
<< "\nuniform tiling = "
<< (uniform_tiling ? "true" : "false") << std::endl;

// Construct TiledRange1's
std::vector<unsigned int> tiling_occ = make_tiling(n_occ, nblk_occ);
std::vector<unsigned int> tiling_uocc = make_tiling(n_uocc, nblk_uocc);
std::vector<unsigned int> tiling_occ =
uniform_tiling ? make_uniform_tiling(n_occ, b_occ)
: make_nonuniform_tiling(n_occ, b_occ);
std::vector<unsigned int> tiling_uocc =
uniform_tiling ? make_uniform_tiling(n_uocc, b_uocc)
: make_nonuniform_tiling(n_uocc, b_uocc);
auto trange_occ = TA::TiledRange1(tiling_occ.begin(), tiling_occ.end());
auto trange_uocc = TA::TiledRange1(tiling_uocc.begin(), tiling_uocc.end());

if (use_complex)
cc_abcd<std::complex<double>>(world, trange_occ, trange_uocc, repeat);
else
auto print_tile_sizes = [](const auto& tiling) {
auto b = tiling.begin();
for (auto current = b + 1; current != tiling.end(); ++current) {
std::cout << *current - *(current - 1) << " ";
}
std::cout << std::endl;
};
std::cout << " occ tile sizes: ";
print_tile_sizes(tiling_occ);
std::cout << "uocc tile sizes: ";
print_tile_sizes(tiling_uocc);

if (scalar_type_str == "double")
cc_abcd<double>(world, trange_occ, trange_uocc, repeat);
else if (scalar_type_str == "zdouble")
cc_abcd<std::complex<double>>(world, trange_occ, trange_uocc, repeat);
else if (scalar_type_str == "float")
cc_abcd<float>(world, trange_occ, trange_uocc, repeat);
else if (scalar_type_str == "zfloat")
cc_abcd<std::complex<float>>(world, trange_occ, trange_uocc, repeat);

TA::finalize();

Expand Down Expand Up @@ -175,13 +211,13 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
const double flops_per_fma =
(complex_T ? 8 : 2); // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
const double gflops_per_call = flops_per_fma * std::pow(n_occ, 2) *
std::pow(n_uocc, 4) / std::pow(1024., 3);
const double gflops_per_call =
flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9;

// Construct tensors
TA::TArrayD t2(world, trange_oovv);
TA::TArrayD v(world, trange_vvvv);
TA::TArrayD t2_v;
TA::TSpArray<T> t2(world, trange_oovv);
TA::TSpArray<T> v(world, trange_vvvv);
TA::TSpArray<T> t2_v;
// To validate, fill input tensors with random data, otherwise just with 1s
if (do_validate) {
rand_fill_array(t2);
Expand All @@ -201,25 +237,26 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
for (int i = 0; i < repeat; ++i) {
auto tp_start = TiledArray::now();
// this is how the user would express this contraction
if (false) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d");
if (true) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d");

// this demonstrates to the PaRSEC team what happens under the hood of the
// expression above
if (true) {
if (false) {
tensor_contract_444(t2_v, t2, v);

// to validate replace: false -> true
if (do_validate) {
// obtain reference result using the high-level DSL
TA::TArrayD t2_v_ref;
TA::TSpArray<T> t2_v_ref;
t2_v_ref("i,j,a,b") = t2("i,j,c,d") * v("c,d,a,b");
TA::TArrayD error;
TA::TSpArray<T> error;
error("i,j,a,b") = t2_v_ref("i,j,a,b") - t2_v("i,j,a,b");
std::cout << "Validating the result (ignore the timings/performance!): "
"||ref_result - result||_2^2 = "
<< error("i,j,a,b").squared_norm().get() << std::endl;
}
}
t2_v.world().gop.fence();
TiledArray::record_duration_since(tp_start);

const double time = TiledArray::durations().back();
Expand Down Expand Up @@ -357,6 +394,7 @@ template <typename Tile, typename Policy>
void tensor_contract_444(TA::DistArray<Tile, Policy>& tv,
const TA::DistArray<Tile, Policy>& t,
const TA::DistArray<Tile, Policy>& v) {
using Shape = typename Policy::shape_type;
// for convenience, obtain the tiled ranges for the two kinds of dimensions
// used to define t, v, and tv
auto trange_occ = t.trange().dim(0); // the first dimension of t is occ
Expand All @@ -378,10 +416,10 @@ void tensor_contract_444(TA::DistArray<Tile, Policy>& tv,
auto ncols = n_uocc * n_uocc;
TA::detail::ProcGrid proc_grid(world, nrowtiles, ncoltiles, nrows, ncols);
std::shared_ptr<TA::Pmap> pmap;
auto t_eval = make_array_eval(t, t.world(), TA::DenseShape(),
auto t_eval = make_array_eval(t, t.world(), Shape(),
proc_grid.make_row_phase_pmap(ninttiles),
TA::Permutation(), make_array_noop<Tile>());
auto v_eval = make_array_eval(v, v.world(), TA::DenseShape(),
auto v_eval = make_array_eval(v, v.world(), Shape(),
proc_grid.make_col_phase_pmap(ninttiles),
TA::Permutation(), make_array_noop<Tile>());

Expand All @@ -400,7 +438,7 @@ void tensor_contract_444(TA::DistArray<Tile, Policy>& tv,
// 2. there will be a dummy output ArrayEval, its Futures will be set by the
// PTG
auto contract =
make_contract_eval(t_eval, v_eval, world, TA::DenseShape(), pmap,
make_contract_eval(t_eval, v_eval, world, Shape(), pmap,
TA::Permutation(), make_contract<Tile>(4u, 4u, 4u));

// eval() just schedules the Summa task and proceeds
Expand Down
28 changes: 12 additions & 16 deletions examples/gemm/ta_dense_asymm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,6 @@ int main(int argc, char** argv) {
std::cerr << "Error: block sizes must be greater than zero.\n";
return 1;
}
if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) {
std::cerr
<< "Error: dimension size must be evenly divisible by block size.\n";
return 1;
}
const long repeat = (argc >= 8 ? atol(argv[7]) : 5);
if (repeat <= 0) {
std::cerr << "Error: number of repetitions must be greater than zero.\n";
Expand All @@ -72,22 +67,22 @@ int main(int argc, char** argv) {

const bool do_memtrace = (argc >= 10 ? std::atol(argv[9]) : false);

const std::size_t Tm = Nm / Bm;
const std::size_t Tn = Nn / Bn;
const std::size_t Tk = Nk / Bk;

// Construct TiledRange
std::vector<unsigned int> blocking_m;
blocking_m.reserve(Tm + 1);
for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i);
if (blocking_m.back() != Nm) blocking_m.push_back(Nm);

std::vector<unsigned int> blocking_n;
blocking_n.reserve(Tn + 1);
for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i);
if (blocking_n.back() != Nn) blocking_n.push_back(Nn);

std::vector<unsigned int> blocking_k;
blocking_k.reserve(Tk + 1);
for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i);
if (blocking_k.back() != Nk) blocking_k.push_back(Nk);

const std::size_t Tm = blocking_m.size() - 1;
const std::size_t Tn = blocking_n.size() - 1;
const std::size_t Tk = blocking_k.size() - 1;

// Structure of c
std::vector<TiledArray::TiledRange1> blocking_C;
Expand Down Expand Up @@ -138,13 +133,13 @@ int main(int argc, char** argv) {
<< "\nScalar type = " << scalar_type_str
<< "\nSize of A = " << Nm << "x" << Nk << " ("
<< double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of A block = " << Bm << "x" << Bk
<< "\nSize of (largest) A block = " << Bm << "x" << Bk
<< "\nSize of B = " << Nk << "x" << Nn << " ("
<< double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of B block = " << Bk << "x" << Bn
<< "\nSize of (largest) B block = " << Bk << "x" << Bn
<< "\nSize of C = " << Nm << "x" << Nn << " ("
<< double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of C block = " << Bm << "x" << Bn
<< "\nSize of (largest) C block = " << Bm << "x" << Bn
<< "\n# of blocks of C = " << Tm * Tn
<< "\nAverage # of blocks of C/node = "
<< double(Tm * Tn) / double(world.size()) << "\n";
Expand All @@ -153,10 +148,11 @@ int main(int argc, char** argv) {
if (do_memtrace) {
world.gop.fence();
madness::print_meminfo(world.rank(), str);
} else {
world.gop.fence();
}
#ifdef TA_TENSOR_MEM_PROFILE
{
world.gop.fence();
std::cout
<< str << ": TA::Tensor allocated "
<< TA::hostEnv::instance()->host_allocator_getActualHighWatermark()
Expand Down

0 comments on commit cc0cd50

Please sign in to comment.