Skip to content

Commit

Permalink
Merge pull request #913 from e10harvey/perf_test_fix_device_verify
Browse files Browse the repository at this point in the history
perf_test/blas/blas3: Fix device verify
  • Loading branch information
e10harvey authored Mar 22, 2021
2 parents f7e58cd + cbd421f commit 8facb32
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 89 deletions.
207 changes: 118 additions & 89 deletions perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ struct gemm_args {
typedef struct gemm_args gemm_args_t;

static std::string gemm_csv_header_str =
"algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_"
"algorithm,vector_type,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_"
"dims,C_dims,warm_up_n,"
"iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)";

Expand Down Expand Up @@ -249,6 +249,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
std::string algo_name = test_e_str[options.test];
std::string ts = std::to_string(gemm_args.bp.team_size);
std::string vlen = std::to_string(gemm_args.bp.vector_len);
std::string vtype = internal_vector_type::label();
if (experiment_name) algo_name = std::string(experiment_name);
if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO";

Expand All @@ -264,7 +265,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,

gflops = flops / 1e9;

options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << ","
options.out[0] << algo_name << "," << vtype << "," << options.blas_args.gemm.gemm_args << ","
<< static_cast<double>(options.blas_args.gemm.alpha) << ","
<< static_cast<double>(options.blas_args.gemm.beta) << ","
<< ts << "," << vlen << "," << loop_e_str[options.loop] << ","
Expand Down Expand Up @@ -1314,14 +1315,11 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
* @var epsilon: The tolerance to use when comparing.
* @return true if the comparison fails and false if the comparison succeeds.
*/
static inline bool __gemm_print_compare_failure(view_type_3d expected,
view_type_3d actual, int i,
template<class ViewType>
static inline bool __gemm_print_compare_failure(ViewType h_expected,
ViewType h_actual, int i,
int j, int k, double epsilon) {
STATUS;
typename view_type_3d::HostMirror h_expected =
Kokkos::create_mirror_view(expected);
typename view_type_3d::HostMirror h_actual =
Kokkos::create_mirror_view(actual);
auto diff = static_cast<double>(Kokkos::Experimental::fabs(
static_cast<double>(h_expected(i, j, k) - h_actual(i, j, k))));

Expand All @@ -1348,22 +1346,32 @@ static inline bool __gemm_do_compare(view_type_3d expected,
double epsilon = Test::epsilon<ScalarType>::value * 1e3;
STATUS;

typename view_type_3d::HostMirror h_expected =
Kokkos::create_mirror_view(expected);
typename view_type_3d::HostMirror h_actual =
Kokkos::create_mirror_view(actual);

// Copy to host for comparision
Kokkos::deep_copy(h_expected, expected);
Kokkos::deep_copy(h_actual, actual);
Kokkos::fence();

if (std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
for (size_t i = 0; i < expected.extent(0); i++) {
for (size_t j = 0; j < expected.extent(1); j++) {
for (size_t k = 0; k < expected.extent(2); k++) {
if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon))
for (size_t i = 0; i < h_expected.extent(0); i++) {
for (size_t j = 0; j < h_expected.extent(1); j++) {
for (size_t k = 0; k < h_expected.extent(2); k++) {
if (__gemm_print_compare_failure<decltype(h_expected)>(h_expected, h_actual, i, j, k, epsilon))
return true;
}
}
}
}

if (std::is_same<LayoutType, Kokkos::LayoutLeft>::value) {
for (size_t k = 0; k < expected.extent(2); k++) {
for (size_t j = 0; j < expected.extent(1); j++) {
for (size_t i = 0; i < expected.extent(0); i++) {
if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon))
for (size_t k = 0; k < h_expected.extent(2); k++) {
for (size_t j = 0; j < h_expected.extent(1); j++) {
for (size_t i = 0; i < h_expected.extent(0); i++) {
if (__gemm_print_compare_failure<decltype(h_expected)>(h_expected, h_actual, i, j, k, epsilon))
return true;
}
}
Expand All @@ -1379,87 +1387,108 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src,
options_t options) {
using dst_scalar_type = typename dstViewType::value_type;
using src_scalar_type = typename view_type_5d::value_type;
size_t remainder, vector_batch_size, simd_batch_size, last_batch;
bool data_layout_same_as_3d_view = false;
typename dstViewType::HostMirror h_dst =
Kokkos::create_mirror_view(dst);
typename view_type_4d::HostMirror h_src =
Kokkos::create_mirror_view(src.mat_4d);
Kokkos::deep_copy(h_src, src.mat_4d);
Kokkos::fence();

if (options.blas_args.batch_size_last_dim) {
view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(),
simd_internal_vector_size, src.ivec_4d.extent(0),
src.ivec_4d.extent(1), src.ivec_4d.extent(2),
src.ivec_4d.extent(3));
typename view_type_5d::HostMirror h_src_raw =
Kokkos::create_mirror_view(src_raw);
size_t remainder = dst.extent(2) % simd_vector_size;
remainder = remainder == 0 ? simd_internal_vector_size : remainder;

// The below loops copies each corresponding 2-rank matrix within the simd
// view back to the 3-rank view.
for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder;
simd_internal_vec_idx++) {
auto sv0 =
Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(),
Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
for (size_t vector_batch_idx = 0;
vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) {
auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(),
Kokkos::ALL(), Kokkos::ALL());
for (size_t simd_batch_size_idx = 0;
simd_batch_size_idx < src.ivec_4d.extent(3);
simd_batch_size_idx++) {
auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(),
simd_batch_size_idx);
for (size_t m = 0; m < src.ivec_4d.extent(1); m++) {
for (size_t n = 0; n < src.ivec_4d.extent(2); n++) {
dst(m, n,
simd_internal_vec_idx + simd_batch_size_idx +
vector_batch_idx) = sv2(m, n);
}
}
}
}
}
remainder = dst.extent(2) % simd_internal_vector_size;
vector_batch_size = src.ivec_4d.extent(0);
simd_batch_size = src.ivec_4d.extent(3);
last_batch = dst.extent(2);
if (std::is_same<default_layout, Kokkos::LayoutRight>::value && remainder == 0)
data_layout_same_as_3d_view = true;

} else {
view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(),
simd_internal_vector_size, src.ivec_4d.extent(0),
src.ivec_4d.extent(1), src.ivec_4d.extent(2),
src.ivec_4d.extent(3));
typename view_type_5d::HostMirror h_src_raw =
Kokkos::create_mirror_view(src_raw);
size_t remainder = dst.extent(0) % simd_vector_size;

if (remainder > 0) {
// The below loops copies each corresponding 2-rank matrix within the simd
// view back to the 3-rank view.
for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder;
simd_internal_vec_idx++) {
auto sv0 =
Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(),
Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
for (size_t simd_batch_size_idx = 0;
simd_batch_size_idx < src.ivec_4d.extent(0);
simd_batch_size_idx++) {
auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(),
Kokkos::ALL(), Kokkos::ALL());
for (size_t vector_batch_idx = 0;
vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) {
auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(),
vector_batch_idx);
for (size_t m = 0; m < src.ivec_4d.extent(1); m++) {
for (size_t n = 0; n < src.ivec_4d.extent(2); n++) {
dst(simd_internal_vec_idx + simd_batch_size_idx +
vector_batch_idx,
m, n) = sv2(m, n);
}
}
remainder = dst.extent(0) % simd_internal_vector_size;
vector_batch_size = src.ivec_4d.extent(3);
simd_batch_size = src.ivec_4d.extent(0);
last_batch = dst.extent(0);
if (std::is_same<default_layout, Kokkos::LayoutLeft>::value && remainder == 0)
data_layout_same_as_3d_view = true;
}

// When the batch_size is a multiple of the simd_vector_size and the batch_size
// dimension is nearest to the simd_vector_size dimension, each 2-rank matrix
// lies in the correct location and the data can simply be cast to the 3d view.
if (data_layout_same_as_3d_view) {
// We can just re-cast the data to the 3d view but we'll copy it for verification
memcpy(h_dst.data(), h_src.data(),
sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) *
dst.extent(2));
Kokkos::deep_copy(dst, h_dst);
Kokkos::fence();
return;
}

// If the remainder is 0, we have simd_vector_size sub-batches to copy out...
// this is a bad data access pattern but for these perf_tests we will support it.
// If the remainder is non-zero, we have simd_vector_size sub-batches + remainder to
// copy out.
remainder += simd_internal_vector_size;

// Views needed for slow manual copy
using h_view_type_5d = Kokkos::View<src_scalar_type *****, default_layout, Kokkos::HostSpace>;
using h_subview_type_2d = Kokkos::View<src_scalar_type **, Kokkos::LayoutStride, Kokkos::HostSpace>;
using h_subview_type_3d = Kokkos::View<src_scalar_type ***, Kokkos::LayoutStride, Kokkos::HostSpace>;
using h_subview_type_4d = Kokkos::View<src_scalar_type ****, Kokkos::LayoutStride, Kokkos::HostSpace>;
h_view_type_5d h_src_raw;
h_subview_type_4d h_sv0;
h_subview_type_3d h_sv1;
h_subview_type_2d h_sv2;

// TODO: Clean everything below this point up...
if (std::is_same<default_layout, Kokkos::LayoutRight>::value)
h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size);
else
h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(),
simd_internal_vector_size, src.ivec_4d.extent(0),
src.ivec_4d.extent(1), src.ivec_4d.extent(2),
src.ivec_4d.extent(3));

// The below loops copies each corresponding 2-rank matrix within the simd
// view back to the 3-rank view.
for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder;
simd_internal_vec_idx++) {
if (std::is_same<default_layout, Kokkos::LayoutRight>::value)
h_sv0 = Kokkos::subview(h_src_raw, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), simd_internal_vec_idx);
else
h_sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());

for (size_t vector_batch_idx = 0;
vector_batch_idx < vector_batch_size; vector_batch_idx++) {
if (options.blas_args.batch_size_last_dim)
h_sv1 = Kokkos::subview(h_sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
else
h_sv1 = Kokkos::subview(h_sv0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx);
for (size_t simd_batch_size_idx = 0;
simd_batch_size_idx < simd_batch_size;
simd_batch_size_idx++) {
if (options.blas_args.batch_size_last_dim)
h_sv2 = Kokkos::subview(h_sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx);
else
h_sv2 = Kokkos::subview(h_sv1, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL());
for (size_t m = 0; m < src.ivec_4d.extent(1); m++) {
for (size_t n = 0; n < src.ivec_4d.extent(2); n++) {
if (options.blas_args.batch_size_last_dim)
h_dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = h_sv2(m, n);
else
h_dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = h_sv2(m, n);
}
}
if (simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx == last_batch - 1)
goto out;
}
} else {
// When the batch_size is a multiple of the simd_vector_size, each 2-rank
// matrix lies in the correct location and the data can simply be copied.
memcpy(dst.data(), src.ivec_4d.data(),
sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) *
dst.extent(2));
}
}
out:
Kokkos::deep_copy(dst, h_dst);
Kokkos::fence();
}

/**
Expand Down
3 changes: 3 additions & 0 deletions src/batched/KokkosBatched_Vector_SIMD.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,9 @@ namespace KokkosBatched {
enum : int { vector_length = 8 };
typedef __m512d data_type __attribute__ ((aligned(64)));

/// Human-readable name of this SIMD backend, used e.g. when tagging
/// perf-test CSV output with the vector type.
static inline const char* label() { return "AVX512"; }

template<typename,int>
friend class Vector;

Expand Down

0 comments on commit 8facb32

Please sign in to comment.