Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Blas perf_test updates #892

Merged
merged 47 commits into from
Mar 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
b1d26ed
perf_test/blas:
e10harvey Feb 9, 2021
3211987
perf_test: Account for complex flop counts
e10harvey Feb 10, 2021
667fee3
perf_test: Use flop counts from lapack note 41
e10harvey Feb 10, 2021
973afc5
perf_test: Update flop counts
e10harvey Feb 10, 2021
8d28687
perf_test: Update gemm to optionally use RangePolicy
e10harvey Feb 10, 2021
ccbbad3
perf_test: Update GEMM to output GFLOPs
e10harvey Feb 10, 2021
274e928
perf_test: Update gemm size step
e10harvey Feb 10, 2021
6f4e05b
perf_test: Disable KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY
e10harvey Feb 11, 2021
b066fa9
perf_test/blas: Fix GFLOP calculation
e10harvey Feb 11, 2021
63382d3
perf_test/blas/blas3: Add bandwidth metric to trmm
e10harvey Feb 13, 2021
898794e
perf_test: Handle complex numbers in flop count
e10harvey Feb 15, 2021
f11f913
perf_test/blas/blas3: Gemm perf_test_updates
e10harvey Feb 15, 2021
fb41b4c
perf_test/blas/blas3:
e10harvey Feb 16, 2021
a91bd6c
perf_test/blas/blas3: Update csv row for --use_auto
e10harvey Feb 16, 2021
0d4fe93
perf_test/blas/blas3: Add -d option for view allocation
e10harvey Feb 17, 2021
5c729bc
perf_test/blas/blas3: Update team and team_vector for -d
e10harvey Feb 17, 2021
6da5a7b
perf_test/blas/blas3: Add simd gemm as experiment6.
e10harvey Feb 18, 2021
441c4d4
perf_test/blas/blas3: Add experiment7 (Simd + TeamGemm)
e10harvey Feb 23, 2021
3c80586
perf_test/blas/blas3: replace experiment7 with batched_team_simd
e10harvey Mar 2, 2021
b5c7b88
perf_test/batched: Add README.md
e10harvey Mar 2, 2021
d9e9d04
perf_test/blas/blas3: Add last gemm test types
e10harvey Mar 2, 2021
fa23cf7
perf_test/blas/blas3: Apply clang-format
e10harvey Mar 2, 2021
e5fb960
perf_test/blas/blas3: Allocate simd views
e10harvey Mar 2, 2021
30d5472
perf_test/blas/blas3: Update compact mkl functors
e10harvey Mar 3, 2021
2401e9d
perf_test/blas/blas3: Added operators for SerialSimd
e10harvey Mar 3, 2021
950e055
perf_test/blas/blas3: Fix compactMKL batch size
e10harvey Mar 3, 2021
1eab5b4
perf_test/blas: Fix internal function names
e10harvey Mar 3, 2021
c7e4f54
perf_test/blas/blas3: Apply clang-format
e10harvey Mar 3, 2021
147783e
perf_test/blas/blas3: Fix -d 1 for team and serial simd
e10harvey Mar 4, 2021
e3efd45
perf_test/blas/blas3: Update serial simd to use RangePolicy
e10harvey Mar 4, 2021
0127243
perf_test/blas: Update flop counts to use double
e10harvey Mar 4, 2021
4acdaf5
perf_test/blas/blas3: Added verify option
e10harvey Mar 5, 2021
0de685f
test_common: Fix half_t epsilon specialization
e10harvey Mar 5, 2021
29322e8
perf_test/blas/blas3: Use TeamPolicy for serial simd
e10harvey Mar 5, 2021
80ca02e
perf_test/blas/blas3: Process verify option
e10harvey Mar 9, 2021
239d44d
perf_test/blas/blas3: Relax epsilon
e10harvey Mar 9, 2021
55e3eb3
perf_test/blas/blas3: Add TODO for bug
e10harvey Mar 9, 2021
53aa653
perf_test/blas/blas3: Fix verify for simd when batch_size is first dim
e10harvey Mar 10, 2021
192fde6
perf_test/blas/blas3: Complete verify for batch_size in first dimension
e10harvey Mar 10, 2021
e435171
test_common: Update VanillaGEMM with batch_size_last_dim member
e10harvey Mar 10, 2021
0790673
perf_test/blas/blas3: Add batch_size_last_dim to vgemm
e10harvey Mar 10, 2021
137adcc
perf_test/blas/blas3: Update compare routines
e10harvey Mar 10, 2021
891f4bd
test_common: Fix half_t epsilon
e10harvey Mar 10, 2021
a7558b5
perf_test/blas/blas3: Update serial loops
e10harvey Mar 10, 2021
4ea0e4c
test_common: Update VanillaGemm
e10harvey Mar 10, 2021
8522c91
perf_test/blas/blas3: Updates for half_t
e10harvey Mar 11, 2021
4f9dafa
perf_test/blas: Apply clang-format
e10harvey Mar 11, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions perf_test/batched/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Batched BLAS performance tests reside in `perf_test/blas/{blas,blas3}`.
3 changes: 2 additions & 1 deletion perf_test/blas/blas/KokkosBlas_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
#define DEFAULT_STEP 3
#define DEFAULT_WARM_UP_N 100
#define DEFAULT_N 100
#define DEFAULT_K 10
#define DEFAULT_OUT &std::cout
#define DEFAULT_BLAS_ROUTINES "trtri,"

Expand Down Expand Up @@ -117,7 +118,7 @@ static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"};
* @var n: Number of columns.
*/
struct matrix_dim {
int m, n;
int k, m, n;
};
typedef struct matrix_dim matrix_dim_t;

Expand Down
17 changes: 16 additions & 1 deletion perf_test/blas/blas/KokkosBlas_perf_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ static struct option long_options[] = {
{"matrix_size_step", required_argument, 0, 's'},
{"warm_up_loop", required_argument, 0, 'w'},
{"iter", required_argument, 0, 'i'},
{"batch_size", required_argument, 0, 'k'},
{"csv", required_argument, 0, 'c'},
{"routines", required_argument, 0, 'r'},
{"trtri_options", required_argument, 0, 'o'},
Expand Down Expand Up @@ -135,6 +136,11 @@ static void __print_help_blas_perf_test() {
"(default: %d)\n\n",
DEFAULT_N);

printf("\t-k, --batch_size=LEN\n");
printf("\t\tBatch size. Adds third dimension to matrices A and B.\n");
printf("\t\t\tThe value of LEN as an integer. (default: %d)\n",
DEFAULT_K);

printf("\t-c, --csv=/path/to/file.csv\n");
printf("\t\tCsv output file selection.\n");
printf(
Expand Down Expand Up @@ -166,12 +172,16 @@ int main(int argc, char **argv) {
/* set default options */
options.test = DEFAULT_TEST;
options.loop = DEFAULT_LOOP;
options.start.a.k = DEFAULT_K;
options.start.a.m = DEFAULT_MATRIX_START;
options.start.a.n = DEFAULT_MATRIX_START;
options.stop.a.k = DEFAULT_K;
options.stop.a.m = DEFAULT_MATRIX_STOP;
options.stop.a.n = DEFAULT_MATRIX_STOP;
options.start.b.k = DEFAULT_K;
options.start.b.m = DEFAULT_MATRIX_START;
options.start.b.n = DEFAULT_MATRIX_START;
options.stop.b.k = DEFAULT_K;
options.stop.b.m = DEFAULT_MATRIX_STOP;
options.stop.b.n = DEFAULT_MATRIX_STOP;
options.step = DEFAULT_STEP;
Expand All @@ -182,7 +192,7 @@ int main(int argc, char **argv) {

options.blas_args.trtri.trtri_args = DEFAULT_TRTRI_ARGS;

while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:", long_options,
while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:k:", long_options,
&option_idx)) != -1) {
switch (ret) {
case 'h': __print_help_blas_perf_test(); return 0;
Expand Down Expand Up @@ -255,6 +265,11 @@ int main(int argc, char **argv) {
case 's': options.step = atoi(optarg); break;
case 'w': options.warm_up_n = atoi(optarg); break;
case 'i': options.n = atoi(optarg); break;
case 'k':
options.start.a.k = options.stop.a.k =
options.start.b.k = options.stop.b.k =
atoi(optarg);
break;
case 'c':
out_file = optarg;
options.out_file = std::string(out_file);
Expand Down
172 changes: 132 additions & 40 deletions perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,64 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = {
/*************************** Test types and defaults **************************/
#define DEFAULT_TRTRI_ARGS "UU"

/**
 * Flop count of the KokkosBatched::SerialTrtri implementation, which performs
 * trmm and scal on subblocks of the A matrix. a_m subblocks are selected, one
 * per loop iteration below.
 *
 * @param a_m Number of rows of A; also the number of subblocks visited.
 * @param a_n Number of columns of A. Not used by this count; kept so the
 *            signature parallels __trtri_flop_count.
 * @return The accumulated flop count as a double.
 */
static inline double __trtri_impl_flop_count(double a_m, double a_n) {
  (void)a_n;  // Intentionally unused, see the parameter note above.

  double flop_count = 0;
  double flops_per_div, flops_per_mul, flops_per_add;

  if (std::is_same<double, default_scalar>::value ||
      std::is_same<float, default_scalar>::value ||
      std::is_same<Kokkos::Experimental::half_t, default_scalar>::value) {
    flops_per_div = 1;
    flops_per_mul = 1;
    flops_per_add = 1;
  } else {
    // Any other scalar type is treated as complex: count 2 flops for each add
    // and 6 flops for each multiply or divide.
    flops_per_div = 6;
    flops_per_mul = 6;
    flops_per_add = 2;
  }

  for (int i = 0; i < a_m; i++) {
    // Do the per-subblock arithmetic in double: the int product i * (i + 1)
    // overflows (undefined behaviour) once i reaches roughly 46341, whereas
    // the double product is exact for every realistic matrix dimension.
    const double row = static_cast<double>(i);

    flop_count += flops_per_div;  // 1 / A[i,j]
    flop_count +=
        (row * (row + 1.0) / 2.0) * (flops_per_mul + flops_per_add);  // TRMM FLOPS
    flop_count += row * flops_per_mul;                                // SCAL FLOPS
  }

  return flop_count;
}

// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
static inline double __trtri_flop_count(double a_m, double a_n) {
double flops;
double flops_per_mul;
double flops_per_add;

if (a_m != a_n) {
fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__);
exit(255);
}

if (std::is_same<double, default_scalar>::value ||
std::is_same<float, default_scalar>::value ||
std::is_same<Kokkos::Experimental::half_t, default_scalar>::value) {
flops_per_mul = 1;
flops_per_add = 1;
} else {
// For complex, we need to count 2 flops for each add and 6 flops for each multiply.
flops_per_mul = 6;
flops_per_add = 2;
}

flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul +
(1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add;

return flops;
}

using view_type_3d =
Kokkos::View<default_scalar***, default_layout, default_device>;
struct trtri_args {
Expand All @@ -87,18 +145,25 @@ struct trtri_args {
typedef struct trtri_args trtri_args_t;

static std::string trtri_csv_header_str =
"algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,warm_up_n,iter,"
"total_time(s),average_time(s)";
"algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter,"
"total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)";

/*************************** Internal helper fns **************************/
static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args,
double time_in_seconds) {
double flops = trtri_args.A.extent(0) * __trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2));
double gflops = flops / 1e9;
double average_time = time_in_seconds / options.n;

options.out[0] << test_e_str[options.test] << ","
<< options.blas_args.trtri.trtri_args << ","
<< loop_e_str[options.loop] << "," << trtri_args.A.extent(1)
<< loop_e_str[options.loop] << "," << trtri_args.A.extent(0) << "x" << trtri_args.A.extent(1)
<< "x" << trtri_args.A.extent(2) << "," << options.warm_up_n
<< "," << options.n << "," << time_in_seconds << ","
<< time_in_seconds / options.n << std::endl;
<< average_time << ","
<< flops << ","
<< gflops / average_time
<< std::endl;
}

static void __print_trtri_perf_test_options(options_t options) {
Expand Down Expand Up @@ -133,19 +198,26 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) {

STATUS;

for (uint32_t i = 0; i < warm_up_n; ++i) {
auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
for (uint32_t j = 0; j < warm_up_n; ++j) {
for (int i = 0; i < options.start.a.k; ++i) {
auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());

KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A);
KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A);
}
// Fence after each batch operation
Kokkos::fence();
}

timer.reset();
for (uint32_t i = 0; i < n; ++i) {
auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
for (uint32_t j = 0; j < n; ++j) {
for (int i = 0; i < options.start.a.k; ++i) {
auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());

KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A);
KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A);
}
// Fence after each batch operation
Kokkos::fence();
}
Kokkos::fence();
__trtri_output_csv_row(options, trtri_args, timer.seconds());
#else
std::cerr << std::string(__func__)
Expand All @@ -164,19 +236,26 @@ void __do_trtri_serial_batched_template(options_t options,
Kokkos::Timer timer;
using tag = Algo::Trtri::Unblocked;

for (uint32_t i = 0; i < warm_up_n; ++i) {
auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
for (uint32_t j = 0; j < warm_up_n; ++j) {
for (int i = 0; i < options.start.a.k; ++i) {
auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());

SerialTrtri<uplo, diag, tag>::invoke(A);
SerialTrtri<uplo, diag, tag>::invoke(A);
}
// Fence after each batch operation
Kokkos::fence();
}

timer.reset();
for (uint32_t i = 0; i < n; ++i) {
auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
for (uint32_t j = 0; j < n; ++j) {
for (int i = 0; i < options.start.a.k; ++i) {
auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());

SerialTrtri<uplo, diag, tag>::invoke(A);
SerialTrtri<uplo, diag, tag>::invoke(A);
}
// Fence after each batch operation
Kokkos::fence();
}
Kokkos::fence();
__trtri_output_csv_row(options, trtri_args, timer.seconds());
#else
std::cerr << std::string(__func__)
Expand Down Expand Up @@ -241,16 +320,22 @@ void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) {

STATUS;

Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri",
Kokkos::RangePolicy<execution_space>(0, warm_up_n),
parallel_blas_trtri_functor);
Kokkos::fence();
for (uint32_t i = 0; i < warm_up_n; ++i) {
Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri",
Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
parallel_blas_trtri_functor);
// Fence after each batch operation
Kokkos::fence();
}

timer.reset();
Kokkos::parallel_for("parallelBlasTimedLoopTrtri",
Kokkos::RangePolicy<execution_space>(0, n),
parallel_blas_trtri_functor);
Kokkos::fence();
for (uint32_t i = 0; i < n; ++i) {
Kokkos::parallel_for("parallelBlasTimedLoopTrtri",
Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
parallel_blas_trtri_functor);
// Fence after each batch operation
Kokkos::fence();
}
__trtri_output_csv_row(options, trtri_args, timer.seconds());
#else
std::cerr << std::string(__func__)
Expand Down Expand Up @@ -287,16 +372,23 @@ void __do_trtri_parallel_batched_template(options_t options,

STATUS;

Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri",
Kokkos::RangePolicy<execution_space>(0, warm_up_n),
parallel_batched_trtri_functor);
Kokkos::fence();
for (uint32_t i = 0; i < warm_up_n; ++i) {
Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri",
Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
parallel_batched_trtri_functor);
// Fence after each batch operation
Kokkos::fence();
}

timer.reset();
Kokkos::parallel_for("parallelBatchedTimedLoopTrtri",
Kokkos::RangePolicy<execution_space>(0, n),
parallel_batched_trtri_functor);
Kokkos::fence();

for (uint32_t i = 0; i < n; ++i) {
Kokkos::parallel_for("parallelBatchedTimedLoopTrtri",
Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
parallel_batched_trtri_functor);
// Fence after each batch operation
Kokkos::fence();
}
__trtri_output_csv_row(options, trtri_args, timer.seconds());

return;
Expand Down Expand Up @@ -345,7 +437,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) {

trtri_args.uplo = options.blas_args.trtri.trtri_args.c_str()[0];
trtri_args.diag = options.blas_args.trtri.trtri_args.c_str()[1];
trtri_args.A = vta("trtri_args.A", options.n, dim.a.m, dim.a.n);
trtri_args.A = vta("trtri_args.A", dim.a.k, dim.a.m, dim.a.n);
host_A = Kokkos::create_mirror_view(trtri_args.A);

Kokkos::fill_random(trtri_args.A, rand_pool,
Expand All @@ -355,7 +447,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) {

if (trtri_args.uplo == 'U' || trtri_args.uplo == 'u') {
// Make A upper triangular
for (uint32_t k = 0; k < options.n; ++k) {
for (int k = 0; k < dim.a.k; ++k) {
auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
for (int i = 1; i < dim.a.m; i++) {
for (int j = 0; j < i; j++) {
Expand All @@ -367,7 +459,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) {
// Make A lower triangular
// Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int&
// i) {
for (uint32_t k = 0; k < options.n; ++k) {
for (int k = 0; k < dim.a.k; ++k) {
auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
for (int i = 0; i < dim.a.m - 1; i++) {
for (int j = i + 1; j < dim.a.n; j++) {
Expand All @@ -378,7 +470,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) {
}

if (trtri_args.diag == 'U' || trtri_args.diag == 'u') {
for (uint32_t k = 0; k < options.n; ++k) {
for (int k = 0; k < dim.a.k; ++k) {
auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
for (int i = 0; i < min_dim; i++) {
A(i, i) = scalar_type(1);
Expand Down Expand Up @@ -408,8 +500,8 @@ void __do_loop_and_invoke(options_t options,
for (cur_dims = options.start;
cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n &&
cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n;
cur_dims.a.m *= options.step, cur_dims.a.n *= options.step,
cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) {
cur_dims.a.m += options.step, cur_dims.a.n += options.step,
cur_dims.b.m += options.step, cur_dims.b.n += options.step) {
trtri_args = __do_setup<default_scalar, view_type_3d, default_device>(
options, cur_dims);
fn(options, trtri_args);
Expand Down
1 change: 1 addition & 0 deletions perf_test/blas/blas3/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/tpls/gtest)

KOKKOSKERNELS_ADD_EXECUTABLE(
KokkosBlas3_perf_test
Expand Down
Loading