diff --git a/Code_Exercises/CMakeLists.txt b/Code_Exercises/CMakeLists.txt index 7778a5b4..4e49060d 100644 --- a/Code_Exercises/CMakeLists.txt +++ b/Code_Exercises/CMakeLists.txt @@ -66,5 +66,5 @@ add_subdirectory(Local_Memory_Tiling) add_subdirectory(Work_Group_Sizes) add_subdirectory(Matrix_Transpose) add_subdirectory(Functors) -add_subdirectory(OneMKL_gemm) +add_subdirectory(oneMath_gemm) add_subdirectory(More_SYCL_Features) diff --git a/Code_Exercises/OneMKL_gemm/CMakeLists.txt b/Code_Exercises/OneMKL_gemm/CMakeLists.txt deleted file mode 100644 index becd8121..00000000 --- a/Code_Exercises/OneMKL_gemm/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -#[[ - SYCL Academy (c) - - SYCL Academy is licensed under a Creative Commons Attribution-ShareAlike 4.0 - International License. - - You should have received a copy of the license along with this work. If not, - see . -]] - -add_sycl_executable(OneMKL_gemm source_onemkl_usm_gemm) -add_sycl_executable(OneMKL_gemm source_onemkl_buffer_gemm) - -target_link_libraries(OneMKL_gemm_source_onemkl_usm_gemm PUBLIC -L$ENV{MKLROOT} -lonemkl) -target_link_libraries(OneMKL_gemm_source_onemkl_buffer_gemm PUBLIC -L$ENV{MKLROOT} -lonemkl) -if(SYCL_ACADEMY_ENABLE_SOLUTIONS) - add_sycl_executable(OneMKL_gemm solution_onemkl_usm_gemm) - add_sycl_executable(OneMKL_gemm solution_onemkl_buffer_gemm) - - target_link_libraries(OneMKL_gemm_solution_onemkl_usm_gemm PUBLIC -L$ENV{MKLROOT} -lonemkl) - target_link_libraries(OneMKL_gemm_solution_onemkl_buffer_gemm PUBLIC -L$ENV{MKLROOT} -lonemkl) -endif() diff --git a/Code_Exercises/OneMKL_gemm/README.md b/Code_Exercises/OneMKL_gemm/README.md deleted file mode 100644 index 4d27768b..00000000 --- a/Code_Exercises/OneMKL_gemm/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# SYCL Academy - -## Use of oneMKL library ---- - -In this exercise you will learn how to make use of APIs from oneMKL Interfaces library. -More specifically to perform a matrix multiplication using GEMM. 
- -The source code provides template to perform GEMM using oneMKL's USM/buffer API: -Please refer to the API here: https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/blas/gemm.html - ---- -## Exercise `OneMKL_usm_gemm/source.cpp` - -The source code invloves matrix array initialization on host and generate reference results on the host. -`source_*.cpp` has templates with TODOs to complete the exercise - -## Build and execution hints - -To run the example: ./OneMKL_usm_gemm_solution (or) ./OneMKL_usm_gemm_source -To verify with CUBLAS debug info, `export CUBLAS_LOGINFO_DB=1` and `export CUBLAS_LOGDEST_DBG=stdout` - -For DevCloud via JupiterLab follow these [instructions](../devcloudJupyter.md). - -For DPC++: [instructions](../dpcpp.md). - -For AdaptiveCpp: [instructions](../adaptivecpp.md). diff --git a/Code_Exercises/oneMath_gemm/CMakeLists.txt b/Code_Exercises/oneMath_gemm/CMakeLists.txt new file mode 100644 index 00000000..7843b6ed --- /dev/null +++ b/Code_Exercises/oneMath_gemm/CMakeLists.txt @@ -0,0 +1,22 @@ +#[[ + SYCL Academy (c) + + SYCL Academy is licensed under a Creative Commons Attribution-ShareAlike 4.0 + International License. + + You should have received a copy of the license along with this work. If not, + see . 
+]] + +add_sycl_executable(oneMath_gemm source_onemath_usm_gemm) +add_sycl_executable(oneMath_gemm source_onemath_buffer_gemm) + +target_link_libraries(oneMath_gemm_source_onemath_usm_gemm PUBLIC -lonemath) +target_link_libraries(oneMath_gemm_source_onemath_buffer_gemm PUBLIC -lonemath) +if(SYCL_ACADEMY_ENABLE_SOLUTIONS) + add_sycl_executable(oneMath_gemm solution_onemath_usm_gemm) + add_sycl_executable(oneMath_gemm solution_onemath_buffer_gemm) + + target_link_libraries(oneMath_gemm_solution_onemath_usm_gemm PUBLIC -lonemath) + target_link_libraries(oneMath_gemm_solution_onemath_buffer_gemm PUBLIC -lonemath) +endif() diff --git a/Code_Exercises/oneMath_gemm/README.md b/Code_Exercises/oneMath_gemm/README.md new file mode 100644 index 00000000..da86aa37 --- /dev/null +++ b/Code_Exercises/oneMath_gemm/README.md @@ -0,0 +1,29 @@ +# SYCL Academy + +## Exercise 11: Using the oneMath library for matrix multiplication +--- + +In this exercise you will learn how to use the API of the oneMath library and +perform a matrix multiplication using the GEMM routines. + +The source code provides a template to perform GEMM using oneMath's USM/buffer +API. Please refer to the API here: +https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/domains/blas/gemm + +--- +## Exercise `oneMath_usm_gemm/source.cpp` + +The `source_*.cpp` files already include all the code to create input matrices and +compute a reference result serially on host. The exercise is to fill in the +sections marked with "TODO" comments to perform GEMM on a device using oneMath. + +## Build and execution hints + +To run the example: ./oneMath_usm_gemm_solution (or) ./oneMath_usm_gemm_source +To verify with CUBLAS debug info, `export CUBLAS_LOGINFO_DBG=1` and `export CUBLAS_LOGDEST_DBG=stdout` + +For DevCloud via JupyterLab follow these [instructions](../devcloudJupyter.md). + +For DPC++: [instructions](../dpcpp.md). + +For AdaptiveCpp: [instructions](../adaptivecpp.md). 
diff --git a/Code_Exercises/OneMKL_gemm/solution_onemkl_buffer_gemm.cpp b/Code_Exercises/oneMath_gemm/solution_onemath_buffer_gemm.cpp similarity index 72% rename from Code_Exercises/OneMKL_gemm/solution_onemkl_buffer_gemm.cpp rename to Code_Exercises/oneMath_gemm/solution_onemath_buffer_gemm.cpp index acfbb592..a8c006ef 100644 --- a/Code_Exercises/OneMKL_gemm/solution_onemkl_buffer_gemm.cpp +++ b/Code_Exercises/oneMath_gemm/solution_onemath_buffer_gemm.cpp @@ -7,20 +7,20 @@ You should have received a copy of the license along with this work. If not, see . - SYCL Quick Reference + Quick Reference ~~~~~~~~~~~~~~~~~~~~ - // oneMKL APIs: - https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/blas/gemm.html#onemkl-blas-gemm + oneMath execution model: + https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/architecture/architecture - // DGEMM: - https://www.intel.com/content/www/us/en/docs/onemkl/tutorial-c/2021-4/multiplying-matrices-using-dgemm.html + oneMath GEMM API: + https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/domains/blas/gemm */ #include #include -#include +#include #include #include @@ -62,12 +62,12 @@ int VerifyResult(sycl::host_accessor& c_A, T* c_B) { ////////////////////////////////////////////////////////////////////////////////////////// void print_device_info(sycl::queue& Q) { - std::string sycl_dev_name, sycl_runtime, sycl_driver; + std::string sycl_dev_name, sycl_dev_version, sycl_driver; sycl_dev_name = Q.get_device().get_info(); sycl_driver = Q.get_device().get_info(); - sycl_runtime = Q.get_device().get_info(); - std::cout << "Running on " << sycl_dev_name.c_str() << ", SYCL runtime: v" - << sycl_runtime.c_str() + sycl_dev_version = Q.get_device().get_info(); + std::cout << "Running on " << sycl_dev_name.c_str() + << ", version: " << sycl_dev_version.c_str() << ", driver version: " << sycl_driver.c_str() << std::endl; } @@ -117,28 +117,30 @@ int 
main() { } } - // Create a SYCL in-order queue targetting GPU device - sycl::queue Q{sycl::gpu_selector_v, sycl::property::queue::in_order{}}; + // Create a SYCL queue + sycl::queue Q; // Prints some basic info related to the hardware print_device_info(Q); - // TODO: Allocate memory on device, (using sycl::malloc_device APIs) - // Creating 1D buffers for matrices which are bound to host memory array + // Create 1D buffers for matrices which are bound to host memory arrays sycl::buffer a{A.data(), sycl::range<1>{M * N}}; sycl::buffer b{B.data(), sycl::range<1>{N * P}}; sycl::buffer c{C_host.data(), sycl::range<1>{M * P}}; - // TODO: Use oneMKL GEMM USM API - oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans; - oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans; - oneapi::mkl::blas::column_major::gemm(Q, transA, transB, n, m, k, alpha, b, - ldB, a, ldA, beta, c, ldC); - Q.wait(); + // Use oneMath GEMM buffer API + oneapi::math::transpose transA = oneapi::math::transpose::nontrans; + oneapi::math::transpose transB = oneapi::math::transpose::nontrans; + oneapi::math::blas::column_major::gemm(Q, transA, transB, n, m, k, alpha, b, + ldB, a, ldA, beta, c, ldC); + + // Host accessor ensures synchronisation: a read operation on the accessor + // will wait until all kernels writing to buffer "c" finished executing and + // then copy the data back to host sycl::host_accessor C_device{c}; - // Verify results from oneMKL APIs + // Verify results from oneMath int result = 0; - std::cout << "Verify results between OneMKL & Serial: "; + std::cout << "Verify results between oneMath & serial: "; result = VerifyResult(C_device, C_host.data()); return result; diff --git a/Code_Exercises/OneMKL_gemm/solution_onemkl_usm_gemm.cpp b/Code_Exercises/oneMath_gemm/solution_onemath_usm_gemm.cpp similarity index 66% rename from Code_Exercises/OneMKL_gemm/solution_onemkl_usm_gemm.cpp rename to Code_Exercises/oneMath_gemm/solution_onemath_usm_gemm.cpp index 
79f61f77..24c6dc56 100644 --- a/Code_Exercises/OneMKL_gemm/solution_onemkl_usm_gemm.cpp +++ b/Code_Exercises/oneMath_gemm/solution_onemath_usm_gemm.cpp @@ -7,20 +7,20 @@ You should have received a copy of the license along with this work. If not, see . - SYCL Quick Reference + Quick Reference ~~~~~~~~~~~~~~~~~~~~ - // oneMKL APIs: - https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/blas/gemm.html#onemkl-blas-gemm + oneMath execution model: + https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/architecture/architecture - // DGEMM: - https://www.intel.com/content/www/us/en/docs/onemkl/tutorial-c/2021-4/multiplying-matrices-using-dgemm.html + oneMath GEMM API: + https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/domains/blas/gemm */ #include #include -#include +#include #include #include @@ -62,12 +62,12 @@ int VerifyResult(T* c_A, T* c_B) { ////////////////////////////////////////////////////////////////////////////////////////// void print_device_info(sycl::queue& Q) { - std::string sycl_dev_name, sycl_runtime, sycl_driver; + std::string sycl_dev_name, sycl_dev_version, sycl_driver; sycl_dev_name = Q.get_device().get_info(); sycl_driver = Q.get_device().get_info(); - sycl_runtime = Q.get_device().get_info(); - std::cout << "Running on " << sycl_dev_name.c_str() << ", SYCL runtime: v" - << sycl_runtime.c_str() + sycl_dev_version = Q.get_device().get_info(); + std::cout << "Running on " << sycl_dev_name.c_str() + << ", version: " << sycl_dev_version.c_str() << ", driver version: " << sycl_driver.c_str() << std::endl; } @@ -117,35 +117,42 @@ int main() { } } - // Create a SYCL in-order queue targetting GPU device - sycl::queue Q{sycl::gpu_selector_v, sycl::property::queue::in_order{}}; + // Create a SYCL queue + sycl::queue Q; // Prints some basic info related to the hardware print_device_info(Q); - // TODO: Allocate memory on device, (using sycl::malloc_device APIs) + // 
Allocate memory on device, (using sycl::malloc_device APIs) T* a = sycl::malloc_device((M * N), Q); T* b = sycl::malloc_device((N * P), Q); T* c = sycl::malloc_device((M * P), Q); - Q.memcpy(a, A.data(), sizeof(T) * M * N); - Q.memcpy(b, B.data(), sizeof(T) * N * P); - - // TODO: Use oneMKL GEMM USM API - oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans; - oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans; - oneapi::mkl::blas::column_major::gemm(Q, transA, transB, n, m, k, alpha, b, - ldB, a, ldA, beta, c, - ldC); // row-major - + sycl::event eventCopyA = Q.memcpy(a, A.data(), sizeof(T) * M * N); + sycl::event eventCopyB = Q.memcpy(b, B.data(), sizeof(T) * N * P); + + // Use oneMath GEMM USM API + oneapi::math::transpose transA = oneapi::math::transpose::nontrans; + oneapi::math::transpose transB = oneapi::math::transpose::nontrans; + // Pass the synchronisation events to ensure GEMM starts after inputs are + // fully copied to the device + sycl::event eventGEMM = oneapi::math::blas::column_major::gemm( + Q, transA, transB, n, m, k, alpha, b, ldB, a, ldA, beta, c, ldC, + {eventCopyA, eventCopyB}); // row-major + + // Copy the results from device to host for verification std::vector C_device(M * P); - Q.memcpy(C_device.data(), c, sizeof(T) * M * P); - Q.wait(); + // Pass the synchronisation event for the copy to wait until GEMM is finished + sycl::event eventCopyC = + Q.memcpy(C_device.data(), c, sizeof(T) * M * P, eventGEMM); + + // Wait for the copy to finish + eventCopyC.wait(); - // Verify results from oneMKL APIs + // Verify results from oneMath int result = 0; - std::cout << "Verify results between OneMKL & Serial: "; + std::cout << "Verify results between oneMath & serial: "; result = VerifyResult(C_device.data(), C_host.data()); - // TODO: Free memory from device + // Free memory from device sycl::free(a, Q); sycl::free(b, Q); sycl::free(c, Q); diff --git a/Code_Exercises/OneMKL_gemm/source_onemkl_buffer_gemm.cpp 
b/Code_Exercises/oneMath_gemm/source_onemath_buffer_gemm.cpp similarity index 78% rename from Code_Exercises/OneMKL_gemm/source_onemkl_buffer_gemm.cpp rename to Code_Exercises/oneMath_gemm/source_onemath_buffer_gemm.cpp index 14235b27..700fe506 100644 --- a/Code_Exercises/OneMKL_gemm/source_onemkl_buffer_gemm.cpp +++ b/Code_Exercises/oneMath_gemm/source_onemath_buffer_gemm.cpp @@ -7,20 +7,20 @@ You should have received a copy of the license along with this work. If not, see . - SYCL Quick Reference + Quick Reference ~~~~~~~~~~~~~~~~~~~~ - // oneMKL APIs: - https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/blas/gemm.html#onemkl-blas-gemm + oneMath execution model: + https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/architecture/architecture - // DGEMM: - https://www.intel.com/content/www/us/en/docs/onemkl/tutorial-c/2021-4/multiplying-matrices-using-dgemm.html + oneMath GEMM API: + https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/domains/blas/gemm */ #include #include -#include +#include #include #include @@ -62,12 +62,12 @@ int VerifyResult(sycl::host_accessor& c_A, T* c_B) { ////////////////////////////////////////////////////////////////////////////////////////// void print_device_info(sycl::queue& Q) { - std::string sycl_dev_name, sycl_runtime, sycl_driver; + std::string sycl_dev_name, sycl_dev_version, sycl_driver; sycl_dev_name = Q.get_device().get_info(); sycl_driver = Q.get_device().get_info(); - sycl_runtime = Q.get_device().get_info(); - std::cout << "Running on " << sycl_dev_name.c_str() << ", SYCL runtime: v" - << sycl_runtime.c_str() + sycl_dev_version = Q.get_device().get_info(); + std::cout << "Running on " << sycl_dev_name.c_str() + << ", version: " << sycl_dev_version.c_str() << ", driver version: " << sycl_driver.c_str() << std::endl; } @@ -117,22 +117,21 @@ int main() { } } - // Create a SYCL in-order queue targetting GPU device - sycl::queue 
Q{sycl::gpu_selector_v, sycl::property::queue::in_order{}}; + // Create a SYCL queue + sycl::queue Q; // Prints some basic info related to the hardware print_device_info(Q); - // TODO: Allocate memory on device - // Creating 1D buffers for matrices which are bound to host memory array + // TODO: Create 1D buffers for matrices which are bound to host memory arrays - // TODO: Use oneMKL GEMM USM API + // TODO: Use oneMath GEMM buffer API // TODO: Copy the results from device to host for verification // Hint: Use sycl::host_accessor - // Verify results from oneMKL APIs + // Verify results from oneMath int result = 0; - std::cout << "Verify results between OneMKL & Serial: "; + std::cout << "Verify results between oneMath & serial: "; // TODO: Uncomment the following line verify the results // result = VerifyResult(C_device, C_host); diff --git a/Code_Exercises/OneMKL_gemm/source_onemkl_usm_gemm.cpp b/Code_Exercises/oneMath_gemm/source_onemath_usm_gemm.cpp similarity index 80% rename from Code_Exercises/OneMKL_gemm/source_onemkl_usm_gemm.cpp rename to Code_Exercises/oneMath_gemm/source_onemath_usm_gemm.cpp index 32fb55e4..b64b4d33 100644 --- a/Code_Exercises/OneMKL_gemm/source_onemkl_usm_gemm.cpp +++ b/Code_Exercises/oneMath_gemm/source_onemath_usm_gemm.cpp @@ -7,20 +7,20 @@ You should have received a copy of the license along with this work. If not, see . 
- SYCL Quick Reference + Quick Reference ~~~~~~~~~~~~~~~~~~~~ - // oneMKL APIs: - https://spec.oneapi.io/versions/latest/elements/oneMKL/source/domains/blas/gemm.html#onemkl-blas-gemm + oneMath execution model: + https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/architecture/architecture - // DGEMM: - https://www.intel.com/content/www/us/en/docs/onemkl/tutorial-c/2021-4/multiplying-matrices-using-dgemm.html + oneMath GEMM API: + https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/domains/blas/gemm */ #include #include -#include +#include #include #include @@ -62,12 +62,12 @@ int VerifyResult(T* c_A, T* c_B) { ////////////////////////////////////////////////////////////////////////////////////////// void print_device_info(sycl::queue& Q) { - std::string sycl_dev_name, sycl_runtime, sycl_driver; + std::string sycl_dev_name, sycl_dev_version, sycl_driver; sycl_dev_name = Q.get_device().get_info(); sycl_driver = Q.get_device().get_info(); - sycl_runtime = Q.get_device().get_info(); - std::cout << "Running on " << sycl_dev_name.c_str() << ", SYCL runtime: v" - << sycl_runtime.c_str() + sycl_dev_version = Q.get_device().get_info(); + std::cout << "Running on " << sycl_dev_name.c_str() + << ", version: " << sycl_dev_version.c_str() << ", driver version: " << sycl_driver.c_str() << std::endl; } @@ -117,20 +117,20 @@ int main() { } } - // Create a SYCL in-order queue targetting GPU device - sycl::queue Q{sycl::gpu_selector_v, sycl::property::queue::in_order{}}; + // Create a SYCL queue + sycl::queue Q; // Prints some basic info related to the hardware print_device_info(Q); // TODO: Allocate memory on device, (using sycl::malloc_device APIs) - // TODO: Use oneMKL GEMM USM API + // TODO: Use oneMath GEMM USM API // TODO: Copy the results from device to host for verification - // Verify results from oneMKL APIs + // Verify results from oneMath int result = 0; - std::cout << "Verify results between 
OneMKL & Serial: "; + std::cout << "Verify results between oneMath & serial: "; // TODO: Uncomment the following line verify the results // result = VerifyResult(C_device, C_host); diff --git a/Lesson_Materials/OneMKL_gemm/index.html b/Lesson_Materials/OneMKL_gemm/index.html deleted file mode 100644 index 31b95da5..00000000 --- a/Lesson_Materials/OneMKL_gemm/index.html +++ /dev/null @@ -1,154 +0,0 @@ - - - - - - - - - - - - - -
-
-
- - - - -
- -
-
- ## oneAPI Math Kernel Library (oneMKL) Interfaces -
-
- -
- ## Learning Objectives - * Learn about oneMKL library, more specifically oneMKL Interfaces project - * Learn about how to use GEMM APIs from oneMKL with both USM and buffer memory models -
- -
- ## Resources - * oneMKL Interfaces: https://github.com/oneapi-src/oneMKL - * oneMKL specification: https://spec.oneapi.io/versions/latest/elements/oneMKL/source/index.html# - * Important: What is the difference between the following oneMKL terms: - (1) oneAPI Specification for oneMKL - (2) oneAPI's oneMKL Interfaces Project - (3) Intel(R) oneAPI's oneMKL Product - https://github.com/oneapi-src/oneMKL?tab=readme-ov-file#onemkl -
- -
-
- #### Run-time dispatching -
-
-
-
-#include <oneapi/mkl/blas.hpp>
-
-...
-
-sycl::queue cpu_queue(sycl::cpu_selector_v);
-sycl::queue gpu_queue(sycl::gpu_selector_v);
-
-oneapi::mkl::blas::column_major::gemm(cpu_queue, transA, transB, m, ...);
-oneapi::mkl::blas::column_major::gemm(gpu_queue, transA, transB, m, ...);
-				
-
-
- * Backend is loaded at run-time based on device-vendor - * `$> icpx -fsycl –I$ONEMKL/include app.cpp` - * `$> icpx -fsycl app.o –L$ONEMKL/lib –lonemkl` -
-
-
-
-
- #### Compile-time dispatching -
-
-
-
-#include <oneapi/mkl/blas.hpp>
-
-...
-
-sycl::queue cpu_queue(sycl::cpu_selector_v);
-sycl::queue gpu_queue(sycl::gpu_selector_v);
-
-oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector(cpu_queue);
-oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas> gpu_selector(gpu_queue);
-
-oneapi::mkl::blas::column_major::gemm(cpu_selector,
-                                      transA, transB, m, ...);
-oneapi::mkl::blas::column_major::gemm(gpu_selector,
-                                      transA, transB, m, ...);
-				
-
-
- * Uses a templated backend selector APIs, where the template parameters specify the backends - * Application is linked with the required oneMKL backend wrapper libraries - * `$> clang++ -fsycl –I$ONEMKL/include app.cpp` - * `$> clang++ -fsycl app.o –L$ONEMKL/lib –lonemkl_blas_mklcpu –lonemkl_blas_cublas` -
-
-
- -
- ## Exercise - * Objectives: Learn to use oneMKL GEMM buffer, USM APIs - * What is provided: - ** Boiler plate-code provided (a) to perform GEMM on CPU, (b) Helper function to verify results from oneMKL APIs and CPU - ** Please complete the TODO tasks marked in the `source_*.cpp`. - ** Refer to the solutions at `solution_*.cpp` -
-
-
- - - - - - - diff --git a/Lesson_Materials/common-revealjs/images/oneAPI.png b/Lesson_Materials/common-revealjs/images/oneAPI.png new file mode 100644 index 00000000..e822aab9 Binary files /dev/null and b/Lesson_Materials/common-revealjs/images/oneAPI.png differ diff --git a/Lesson_Materials/common-revealjs/images/oneMath-backends.svg b/Lesson_Materials/common-revealjs/images/oneMath-backends.svg new file mode 100644 index 00000000..b4269c56 --- /dev/null +++ b/Lesson_Materials/common-revealjs/images/oneMath-backends.svg @@ -0,0 +1,1427 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ oneMath
+
+
+
+ oneMath +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ BLAS
+
+
+
+ BLAS +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
Sparse BLAS
+
+
+
+
+ Sparse BLAS +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ DFT
+
+
+
+ DFT +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ RNG
+
+
+
+ RNG +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ LAPACK
+
+
+
+ LAPACK +
+
+
+
+ + + + + + + + +
+
+
Intel + oneMKL
+
+
+
+ Intel + oneMKL +
+
+
+
+ + + + + + + + +
+
+
+
Netlib
+
+
+
+
+ Netlib +
+
+
+
+ + + + + + + + +
+
+
+
generic SYCL
+
+
+
+
+ generic + SYCL +
+
+
+
+ + + + + + + + +
+
+
+
cuBLAS
+
+
+
+
+ cuBLAS +
+
+
+
+ + + + + + + + +
+
+
+
rocBLAS
+
+
+
+
+ + rocBLAS +
+
+
+
+ + + + + + + + +
+
+
x86 + CPU / Intel GPU
+
+
+
+ x86 + CPU / Intel GPU +
+
+
+
+ + + + + + + + +
+
+
x86 + CPU
+
+
+
+ x86 + CPU +
+
+
+
+ + + + + + + + +
+
+
NVIDIA + GPU
+
+
+
+ NVIDIA + GPU +
+
+
+
+ + + + + + + + +
+
+
AMD + GPU
+
+
+
+ AMD + GPU +
+
+
+
+ + + + + + + + +
+
+
Any + SYCL device
+
+
+
+ Any + SYCL device +
+
+
+
+ + + + + + + + +
+
+
Intel + oneMKL
+
+
+
+ Intel + oneMKL +
+
+
+
+ + + + + + + + +
+
+
x86 + CPU / Intel GPU
+
+
+
+ x86 + CPU / Intel GPU +
+
+
+
+ + + + + + + + +
+
+
+
cuSPARSE
+
+
+
+
+ + cuSPARSE +
+
+
+
+ + + + + + + + +
+
+
+
rocSPARSE
+
+
+
+
+ + rocSPARSE +
+
+
+
+ + + + + + + + +
+
+
NVIDIA + GPU
+
+
+
+ NVIDIA + GPU +
+
+
+
+ + + + + + + + +
+
+
AMD + GPU
+
+
+
+ AMD + GPU +
+
+
+
+ + + + + + + + +
+
+
Intel + oneMKL
+
+
+
+ Intel + oneMKL +
+
+
+
+ + + + + + + + +
+
+
x86 + CPU / Intel GPU
+
+
+
+ x86 + CPU / Intel GPU +
+
+
+
+ + + + + + + + +
+
+
+
cuFFT
+
+
+
+
+ cuFFT +
+
+
+
+ + + + + + + + +
+
+
+
rocFFT
+
+
+
+
+ + rocFFT +
+
+
+
+ + + + + + + + +
+
+
NVIDIA + GPU
+
+
+
+ NVIDIA + GPU +
+
+
+
+ + + + + + + + +
+
+
AMD + GPU
+
+
+
+ AMD + GPU +
+
+
+
+ + + + + + + + +
+
+
+
portFFT
+
+
+
+
+ + portFFT +
+
+
+
+ + + + + + + + +
+
+
Any + SYCL device
+
+
+
+ Any + SYCL device +
+
+
+
+ + + + + + + + +
+
+
Intel + oneMKL
+
+
+
+ Intel + oneMKL +
+
+
+
+ + + + + + + + +
+
+
x86 + CPU / Intel GPU
+
+
+
+ x86 + CPU / Intel GPU +
+
+
+
+ + + + + + + + +
+
+
+
cuRAND
+
+
+
+
+ + cuRAND +
+
+
+
+ + + + + + + + +
+
+
+
rocRAND
+
+
+
+
+ + rocRAND +
+
+
+
+ + + + + + + + +
+
+
NVIDIA + GPU
+
+
+
+ NVIDIA + GPU +
+
+
+
+ + + + + + + + +
+
+
AMD + GPU
+
+
+
+ AMD + GPU +
+
+
+
+ + + + + + + + +
+
+
Intel + oneMKL
+
+
+
+ Intel + oneMKL +
+
+
+
+ + + + + + + + +
+
+
x86 + CPU / Intel GPU
+
+
+
+ x86 + CPU / Intel GPU +
+
+
+
+ + + + + + + + +
+
+
+
cuSOLVER
+
+
+
+
+ + cuSOLVER +
+
+
+
+ + + + + + + + +
+
+
+
rocSOLVER
+
+
+
+
+ + rocSOLVER +
+
+
+
+ + + + + + + + +
+
+
NVIDIA + GPU
+
+
+
+ NVIDIA + GPU +
+
+
+
+ + + + + + + + +
+
+
AMD + GPU
+
+
+
+ AMD + GPU +
+
+
+
+ + + + + + + + +
+
+
+
Vector Math
and Statistics
domains not yet
implemented
+
+
+
+
+ Vector Math and Statistics domains not yet implemented +
+
+
+
+
+
+
+
\ No newline at end of file diff --git a/Lesson_Materials/common-revealjs/images/uxl.svg b/Lesson_Materials/common-revealjs/images/uxl.svg new file mode 100644 index 00000000..10ccd074 --- /dev/null +++ b/Lesson_Materials/common-revealjs/images/uxl.svg @@ -0,0 +1,205 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/Lesson_Materials/oneMath_gemm/index.html b/Lesson_Materials/oneMath_gemm/index.html new file mode 100644 index 00000000..44e278d7 --- /dev/null +++ b/Lesson_Materials/oneMath_gemm/index.html @@ -0,0 +1,208 @@ + + + + + + + + + + + + + +
+
+
+ + + + +
+ +
+

+ oneAPI Math Library (oneMath) +

+
+ +
+ ## Learning Objectives + * Learn what oneMath is and how it works + * Learn how to use GEMM APIs from oneMath with both USM and buffer memory models +
+ +
+
+ ## Do you need to write your own kernels? +
+ +
+ * Many computationally intensive applications spend most of their time in **common operations / algorithms** + * **Numerical libraries** provide reliable solutions to these common problems + * You can focus on solving higher-level problems instead of technical details + * Libraries optimised for specific hardware provide **superior performance** +
+
+ +
+
+ ## Numerical libraries +
+ +
+ * Common APIs like BLAS or LAPACK have multiple CPU implementations and vendor-specific GPU solutions + * **Intel CPU/GPU**: Intel oneAPI Math Kernel Library (oneMKL) + * **NVIDIA GPU**: cuBLAS, cuSOLVER, cuRAND, cuFFT + * **AMD GPU**: rocBLAS, rocSOLVER, rocRAND, rocFFT + * Imagine being able to use all of them with a *single source code* → **oneMath** +
+
+ +
+
+

+ oneAPI and oneMath +

+
+
+ * Open-source [**oneAPI**](https://oneapi.io/) project governed by the [Unified Acceleration (UXL) Foundation](https://uxlfoundation.org/): + * defines SYCL-based APIs and provides library implementations + * brings performance and ease of development to SYCL applications + * [**oneMath** specification](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onemath/source/): + * defines SYCL API for numerical computations across several domains + * Linear Algebra, Discrete Fourier Transforms, Random Number Generators, Statistics, Vector Math + * [**oneMath** library](https://github.com/uxlfoundation/oneMath): + * wrapper implementation dispatching SYCL API calls to a multitude of implementations, both generic and vendor-specific +
+
+ +
+ +
+
+ +
+

+ oneMath library backends +

+ +
+ +
+
+ #### Run-time dispatching +
+
+

+#include <oneapi/math.hpp>
+
+sycl::queue q{myDeviceSelector};
+
+sycl::buffer<T,1> a{a_host, m*k};
+sycl::buffer<T,1> b{b_host, k*n};
+sycl::buffer<T,1> c{c_host, m*n};
+
+// Compute C = A*B+C on the device
+oneapi::math::blas::column_major::gemm(q, ..., m, n, k, ..., a, ..., b, ..., c, ... );
+						
+
+
+ * Backend is loaded at run time based on the device associated with the SYCL queue + * Both buffer and USM APIs available (mind the different synchronisation) + * The same binary can run on different hardware with a generic device selector + * Can run on CPU or different GPUs without recompiling + * Link the application with the top-level runtime library: `-lonemath` +
+
+ +
+
+ #### Compile-time dispatching +
+
+

+#include <oneapi/math.hpp>
+
+sycl::queue cpu_queue{sycl::cpu_selector_v};
+
+sycl::buffer<T,1> a{a_host, m*k};
+sycl::buffer<T,1> b{b_host, k*n};
+sycl::buffer<T,1> c{c_host, m*n};
+
+oneapi::math::backend_selector<oneapi::math::backend::mklcpu> cpu_selector(cpu_queue);
+// Select the Intel oneMKL CPU backend specifically ^^^^^^
+
+oneapi::math::blas::column_major::gemm(cpu_selector, ..., m, n, k, ..., a, ..., b, ..., c, ... );
+						
+
+
+ * A specific backend can be selected at compile time with a `backend_selector` + * Passed into the API in place of the queue + * Reduces the small dispatching overhead at the cost of reduced portability + * Link the application with the specific backend library: `-lonemath_blas_mklcpu` +
+
+ +
+
+ ## Exercise +
+
+ * Objectives: Learn to use oneMath GEMM buffer and USM APIs + * Boilerplate code is already provided to: + * Initialize matrices on host + * Compute reference result on host + * Compare the host and device results + * Please **complete the TODO tasks** marked in the `source_*.cpp` files + * Create buffers or transfer data with USM + * Compute GEMM by calling the oneMath API + * Use the provided `VerifyResult` function + * If stuck, have a look at `solution_*.cpp` +
+
+
+
+ + + + + + + +