diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 20e792d9ae..b9556d1c46 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1315,14 +1315,11 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { * @var epsilon: The tolerance to use when comparing. * @return true if the comparison fails and false if the comparison succeeds. */ -static inline bool __gemm_print_compare_failure(view_type_3d expected, - view_type_3d actual, int i, +template +static inline bool __gemm_print_compare_failure(ViewType h_expected, + ViewType h_actual, int i, int j, int k, double epsilon) { STATUS; - typename view_type_3d::HostMirror h_expected = - Kokkos::create_mirror_view(expected); - typename view_type_3d::HostMirror h_actual = - Kokkos::create_mirror_view(actual); auto diff = static_cast(Kokkos::Experimental::fabs( static_cast(h_expected(i, j, k) - h_actual(i, j, k)))); @@ -1349,11 +1346,21 @@ static inline bool __gemm_do_compare(view_type_3d expected, double epsilon = Test::epsilon::value * 1e3; STATUS; + typename view_type_3d::HostMirror h_expected = + Kokkos::create_mirror_view(expected); + typename view_type_3d::HostMirror h_actual = + Kokkos::create_mirror_view(actual); + + // Copy to host for comparison + Kokkos::deep_copy(h_expected, expected); + Kokkos::deep_copy(h_actual, actual); + Kokkos::fence(); + if (std::is_same::value) { - for (size_t i = 0; i < expected.extent(0); i++) { - for (size_t j = 0; j < expected.extent(1); j++) { - for (size_t k = 0; k < expected.extent(2); k++) { - if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) + for (size_t i = 0; i < h_expected.extent(0); i++) { + for (size_t j = 0; j < h_expected.extent(1); j++) { + for (size_t k = 0; k < h_expected.extent(2); k++) { + if (__gemm_print_compare_failure(h_expected, h_actual, i, j, k, epsilon)) return true; } } 
@@ -1361,10 +1368,10 @@ static inline bool __gemm_do_compare(view_type_3d expected, } if (std::is_same::value) { - for (size_t k = 0; k < expected.extent(2); k++) { - for (size_t j = 0; j < expected.extent(1); j++) { - for (size_t i = 0; i < expected.extent(0); i++) { - if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) + for (size_t k = 0; k < h_expected.extent(2); k++) { + for (size_t j = 0; j < h_expected.extent(1); j++) { + for (size_t i = 0; i < h_expected.extent(0); i++) { + if (__gemm_print_compare_failure(h_expected, h_actual, i, j, k, epsilon)) return true; } } @@ -1380,20 +1387,28 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, options_t options) { using dst_scalar_type = typename dstViewType::value_type; using src_scalar_type = typename view_type_5d::value_type; - size_t remainder, vector_batch_size, simd_batch_size; + size_t remainder, vector_batch_size, simd_batch_size, last_batch; bool data_layout_same_as_3d_view = false; + typename dstViewType::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + typename view_type_4d::HostMirror h_src = + Kokkos::create_mirror_view(src.mat_4d); + Kokkos::deep_copy(h_src, src.mat_4d); + Kokkos::fence(); if (options.blas_args.batch_size_last_dim) { - remainder = dst.extent(2) % simd_vector_size; + remainder = dst.extent(2) % simd_internal_vector_size; vector_batch_size = src.ivec_4d.extent(0); simd_batch_size = src.ivec_4d.extent(3); + last_batch = dst.extent(2); if (std::is_same::value && remainder == 0) data_layout_same_as_3d_view = true; } else { - remainder = dst.extent(0) % simd_vector_size; + remainder = dst.extent(0) % simd_internal_vector_size; vector_batch_size = src.ivec_4d.extent(3); simd_batch_size = src.ivec_4d.extent(0); + last_batch = dst.extent(0); if (std::is_same::value && remainder == 0) data_layout_same_as_3d_view = true; } @@ -1403,34 +1418,38 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, // lies in the correct 
location and the data can simply be cast to the 3d view. if (data_layout_same_as_3d_view) { // We can just re-cast the data to the 3d view but we'll copy it for verification - memcpy(dst.data(), src.ivec_4d.data(), + memcpy(h_dst.data(), h_src.data(), sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); + Kokkos::deep_copy(dst, h_dst); + Kokkos::fence(); return; } // If the remainder is 0, we have simd_vector_size sub-batches to copy out... // this is a bad data access pattern but for these perf_tests we will support it. - remainder = remainder == 0 ? simd_vector_size : remainder; + // If the remainder is non-zero, we have simd_vector_size sub-batches + remainder to + // copy out. + remainder += simd_internal_vector_size; // Views needed for slow manual copy - view_type_5d src_raw; + using h_view_type_5d = Kokkos::View; using h_subview_type_2d = Kokkos::View; using h_subview_type_3d = Kokkos::View; using h_subview_type_4d = Kokkos::View; + h_view_type_5d h_src_raw; h_subview_type_4d h_sv0; h_subview_type_3d h_sv1; h_subview_type_2d h_sv2; + // TODO: Clean everything below this point up... if (std::is_same::value) - src_raw = view_type_5d((src_scalar_type *)src.ivec_4d.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); + h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); else - src_raw = view_type_5d((src_scalar_type *)src.ivec_4d.data(), + h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); - typename view_type_5d::HostMirror h_src_raw = - Kokkos::create_mirror_view(src_raw); // The below loops copies each corresponding 2-rank matrix within the simd // view back to the 3-rank view. 
@@ -1457,14 +1476,19 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { if (options.blas_args.batch_size_last_dim) - dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = h_sv2(m, n); + h_dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = h_sv2(m, n); else - dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = h_sv2(m, n); + h_dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = h_sv2(m, n); } } + if (simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx == last_batch - 1) + goto out; } } } +out: + Kokkos::deep_copy(dst, h_dst); + Kokkos::fence(); } /**