Skip to content

Commit

Permalink
Updated the OpenMP matrix multiply.
Browse files Browse the repository at this point in the history
  • Loading branch information
isazi committed Aug 22, 2024
1 parent ddf501f commit 1698a9b
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions examples/directives/matrix_multiply_c_openmp.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""

-from kernel_tuner import tune_kernel
+from kernel_tuner import tune_kernel, run_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives

N = 4096
Expand All @@ -13,11 +13,11 @@
#pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
float temp_sum = 0.0f;
#pragma omp target
-#pragma omp teams collapse(2)
+#pragma omp teams distribute collapse(2)
for ( int i = 0; i < N; i++) {
for ( int j = 0; j < N; j++ ) {
temp_sum = 0.0f;
-#pragma omp distribute parallel for num_threads(nthreads) reduction(+:temp_sum)
+#pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum)
for ( int k = 0; k < N; k++ ) {
temp_sum += A[(i * N) + k] * B[(k * N) + j];
}
Expand All @@ -40,13 +40,20 @@
metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9

+# compute reference solution from CPU
+results = run_kernel(
+"mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
+)
+answer = [None, None, results[2]]
+
tune_kernel(
"mm",
kernel_string["mm"],
0,
kernel_args["mm"],
tune_params,
metrics=metrics,
-compiler_options=["-fast", "-mp=gpu"],
+answer=answer,
compiler="nvc++",
+compiler_options=["-fast", "-mp=gpu"],
)

0 comments on commit 1698a9b

Please sign in to comment.