Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Towards unified device & host tasks #299

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ if (TARGET tiledarray)
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2)

add_ttg_executable(testing_dpotrf potrf/testing_dpotrf.cc LINK_LIBRARIES tiledarray lapackpp)
add_ttg_executable(testing_dpotrf_host potrf/testing_dpotrf.cc
LINK_LIBRARIES tiledarray lapackpp
COMPILE_DEFINITIONS TTG_ENABLE_DEV_HOST=1)
add_ttg_executable(testing_dtrtri potrf/testing_dtrtri.cc LINK_LIBRARIES tiledarray lapackpp)
add_ttg_executable(testing_dlauum potrf/testing_dlauum.cc LINK_LIBRARIES tiledarray lapackpp)
add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES tiledarray lapackpp)
Expand Down Expand Up @@ -50,14 +53,27 @@ if (TARGET tiledarray)
endif()

if (TTG_HAVE_CUDA)
add_ttg_executable(chain-ttg-cuda task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
add_ttg_executable(chain-ttg-dev-cuda task-benchmarks/chain-ttg-dev.cc
COMPILE_DEFINITIONS CHAIN_CUDA=1
LINK_LIBRARIES tiledarray
RUNTIMES "parsec")
endif(TTG_HAVE_CUDA)

if (TTG_HAVE_HIP)
add_ttg_executable(chain-ttg-hip task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
add_ttg_executable(chain-ttg-dev-hip task-benchmarks/chain-ttg-dev.cc
COMPILE_DEFINITIONS CHAIN_HIP=1
LINK_LIBRARIES tiledarray
RUNTIMES "parsec")
endif(TTG_HAVE_HIP)
endif()

add_ttg_executable(chain-ttg-host task-benchmarks/chain-ttg.cc)

add_ttg_executable(chain-ttg-dev-host task-benchmarks/chain-ttg-dev.cc
COMPILE_DEFINITIONS CHAIN_HOST=1
LINK_LIBRARIES tiledarray
RUNTIMES "parsec")

if (TARGET MADworld)
add_ttg_executable(madness-1d madness/madness-1d/madness-1d.cc RUNTIMES "mad")
if (TARGET blaspp) #(CBLAS_FOUND AND MKL_FOUND)
Expand Down
8 changes: 5 additions & 3 deletions examples/matrixtile.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@

#include <TiledArray/device/allocators.h>
#if defined(TILEDARRAY_HAS_DEVICE)
#define ALLOCATOR TiledArray::device_pinned_allocator<T>
// Default allocator for MatrixTile when TILEDARRAY_HAS_DEVICE is defined:
// an alias for TiledArray::device_pinned_allocator<T>.
template<typename T>
using default_allocator_t = TiledArray::device_pinned_allocator<T>;

inline void allocator_init(int argc, char **argv) {
// initialize MADNESS so that TA allocators can be created
Expand All @@ -26,15 +27,16 @@ inline void allocator_fini() {
madness::finalize();
}
#else // TILEDARRAY_HAS_DEVICE
#define ALLOCATOR std::allocator<T>
// Default allocator for MatrixTile when TILEDARRAY_HAS_DEVICE is not defined:
// an alias for std::allocator<T>.
template<typename T>
using default_allocator_t = std::allocator<T>;

// Variant used when TILEDARRAY_HAS_DEVICE is not defined: there is nothing to
// set up, so the body is empty and argc/argv are ignored.
inline void allocator_init(int argc, char **argv) { }

// Variant used when TILEDARRAY_HAS_DEVICE is not defined: there is nothing to
// tear down, so the body is empty.
inline void allocator_fini() { }

#endif // TILEDARRAY_HAS_DEVICE

template <typename T, class Allocator = ALLOCATOR>
template <typename T, class Allocator = default_allocator_t<T>>
class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
public:
using metadata_t = typename std::tuple<std::size_t, std::size_t, std::size_t>;
Expand Down
128 changes: 63 additions & 65 deletions examples/potrf/potrf.h

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/spmm/spmm_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -819,7 +819,7 @@ class SpMM25D {
}
}

ttg::device::Task op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk,
ttg::CoTask op(const Key<3> &ijk, typename baseT::input_refs_tuple_type &&_ijk,
std::tuple<Out<Key<2>, Blk>, Out<Key<3>, Blk>> &result) {
const auto i = ijk[0];
const auto j = ijk[1];
Expand Down
25 changes: 15 additions & 10 deletions examples/task-benchmarks/chain-ttg-dev.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@

#include "chrono.h"

#if defined(TTG_HAVE_CUDA)
#if defined(CHAIN_CUDA)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the difference between CHAIN_CUDA/HIP and ENABLE_CUDA/HIP?

#ifndef TTG_HAVE_CUDA
#error Cannot build CUDA chain benchmark against TTG that does not support CUDA!
#endif
#define ES ttg::ExecutionSpace::CUDA
#elif defined(TTG_HAVE_HIP)
#elif defined(CHAIN_HIP)
#define ES ttg::ExecutionSpace::HIP
#ifndef TTG_HAVE_HIP
#error Cannot build HIP chain benchmark against TTG that does not support HIP!
#endif
#else
#error "Either CUDA OR HIP is required to build this test!"
#endif // 0
#define ES ttg::ExecutionSpace::Host
#endif

#define NUM_TASKS 100000

Expand Down Expand Up @@ -53,7 +59,7 @@ auto make_ttg<1>(bool do_move) {
send<0>(0, A{});
}, edges(), edges(I2N));

auto next = make_tt<ES, int>([=](const int &key, auto&& value) -> ttg::device::Task {
auto next = make_tt<int>([=](const int &key, auto&& value) -> ttg::CoTask<ES> {
//++task_counter;
co_await ttg::device::select(value.b);
if (key < NUM_TASKS) {
Expand All @@ -62,7 +68,6 @@ auto make_ttg<1>(bool do_move) {
} else {
co_await ttg::device::forward(ttg::device::send<0>(key+1, value));
}
} else {
}
} , edges(fuse(I2N, N2N)), edges(N2N));

Expand All @@ -80,7 +85,7 @@ auto make_ttg<2>(bool do_move) {
send<1>(0, A{});
}, edges(), edges(I2N1, I2N2));

auto next = make_tt<ES, int>([=](const int &key, A&& v1, A&& v2) -> ttg::device::Task {
auto next = make_tt<int>([=](const int &key, A&& v1, A&& v2) -> ttg::CoTask<ES> {
co_await ttg::device::select(v1.b, v2.b);
if (key < NUM_TASKS) {
if (do_move) {
Expand Down Expand Up @@ -110,7 +115,7 @@ auto make_ttg<4>(bool do_move) {
send<3>(0, A{});
}, edges(), edges(I2N1, I2N2, I2N3, I2N4));

auto next = make_tt<ES, int>([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device::Task {
auto next = make_tt<int>([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::CoTask<ES> {
co_await ttg::device::select(v1.b, v2.b, v3.b, v4.b);
if (key < NUM_TASKS) {
if (do_move) {
Expand Down Expand Up @@ -150,7 +155,7 @@ auto make_ttg<8>(bool do_move) {
send<7>(0, A{});
}, edges(), edges(I2N1, I2N2, I2N3, I2N4, I2N5, I2N6, I2N7, I2N8));

auto next = make_tt<ES, int>([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device::Task {
auto next = make_tt<int>([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::CoTask<ES> {
co_await ttg::device::select(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b);
if (key < NUM_TASKS) {
if (do_move) {
Expand Down Expand Up @@ -187,7 +192,7 @@ auto make_ttg<0>(bool do_move) {

auto init = make_tt<void>([](std::tuple<Out<int, void>> &outs) { sendk<0>(0, outs); }, edges(), edges(I2N));

auto next = make_tt<ES>([](const int& key) -> ttg::device::Task {
auto next = make_tt([](const int& key) -> ttg::CoTask<ES> {
co_await ttg::device::select();
if (key < NUM_TASKS) {
co_await ttg::device::forward(ttg::device::sendk<0>(key+1));
Expand Down
Loading