From f1091a68c1850e64baf7ab36903c7d7db4e9eaf0 Mon Sep 17 00:00:00 2001 From: Rafal Bielski Date: Wed, 15 Jan 2025 14:53:37 +0000 Subject: [PATCH] Merge the ND-Range Kernel lesson into Data Parallelism (#388) The Data Parallelism and ND-Range Kernel lessons duplicated a lot of material, which became confusing when we tried presenting both in a workshop. The duplication is unnecessary and also adds to the maintenance cost. Merge the "ND-Range Kernel" lesson into "Data Parallelism" by moving all the unique slides from the latter into the former and reordering so it has a reasonable flow. The exercises were also similar, where "Data Parallelism" tested sycl::range + sycl::id version and ND-Range tested sycl::range + sycl::item as well as sycl::nd_range + sycl::nd_item. Merge the two by using most of the code from the latter, but using sycl::id instead of sycl::item. Merge the README instructions from both to retain full detail. The top-level README is adjusted to renumber all lessons after the removed one. Constant memory is deprecated in SYCL 2020, so removed it from the slides about the SYCL memory model. --- Code_Exercises/CMakeLists.txt | 1 - Code_Exercises/Data_Parallelism/README.md | 13 +- Code_Exercises/Data_Parallelism/solution.cpp | 79 +++- Code_Exercises/Data_Parallelism/source.cpp | 10 +- Code_Exercises/ND_Range_Kernel/CMakeLists.txt | 14 - Code_Exercises/ND_Range_Kernel/README.md | 48 -- Code_Exercises/ND_Range_Kernel/solution.cpp | 98 ---- Code_Exercises/ND_Range_Kernel/source.cpp | 75 --- Lesson_Materials/Data_Parallelism/index.html | 363 ++++++++++++--- Lesson_Materials/ND_Range_Kernel/index.html | 431 ------------------ README.md | 100 ++-- 11 files changed, 435 insertions(+), 797 deletions(-) delete mode 100644 Code_Exercises/ND_Range_Kernel/CMakeLists.txt delete mode 100644 Code_Exercises/ND_Range_Kernel/README.md delete mode 100644 Code_Exercises/ND_Range_Kernel/solution.cpp delete mode 100644 Code_Exercises/ND_Range_Kernel/source.cpp delete mode 100644 Lesson_Materials/ND_Range_Kernel/index.html diff --git a/Code_Exercises/CMakeLists.txt b/Code_Exercises/CMakeLists.txt index 4e49060d..a0bc251c 100644 --- a/Code_Exercises/CMakeLists.txt +++ b/Code_Exercises/CMakeLists.txt @@ -58,7 +58,6 @@ add_subdirectory(Data_and_Dependencies) add_subdirectory(In_Order_Queue) add_subdirectory(Advanced_Data_Flow) add_subdirectory(Multiple_Devices) -add_subdirectory(ND_Range_Kernel) add_subdirectory(Image_Convolution) add_subdirectory(Coalesced_Global_Memory) add_subdirectory(Vectors) diff --git a/Code_Exercises/Data_Parallelism/README.md b/Code_Exercises/Data_Parallelism/README.md index fafa95ef..597ff6c9 100644 --- a/Code_Exercises/Data_Parallelism/README.md +++ b/Code_Exercises/Data_Parallelism/README.md @@ -27,13 +27,20 @@ Create `accessor`s to each of the `buffer`s within the command group function. Now enqueue parallel kernel function by calling `parallel_for` on the `handler`. -This function takes a `range` specifying the number of iterations of the kernel -function to invoke and the kernel function itself must take an `id` which -represents the current iteration. +#### 4.1 ) Use the `range` and `id` variant +This version of `parallel_for` takes a `range` specifying the number of +iterations of the kernel function to invoke and the kernel function itself must +take an `id` which represents the current iteration. The `id` can be used in the `accessor` subscript operator to access or assign to the corresponding element of data that the accessor represents. 
+#### 4.2 ) Use the `nd_range` and `nd_item` variant +This version of `parallel_for` takes an `nd_range` which is made up of two +`range`s describing the global range and the local range (work-group size). The +kernel function must take an `nd_item`, which cannot be passed directly to the +subscript operator of an `accessor`. Instead, retrieve the `id` using the +`get_global_id` member function. #### Build And Execution Hints diff --git a/Code_Exercises/Data_Parallelism/solution.cpp b/Code_Exercises/Data_Parallelism/solution.cpp index 2c4a17eb..efcbca62 100644 --- a/Code_Exercises/Data_Parallelism/solution.cpp +++ b/Code_Exercises/Data_Parallelism/solution.cpp @@ -12,16 +12,54 @@ #include "../helpers.hpp" -class vector_add; +class vector_add_1; +class vector_add_2; -int main() { +void test_range() { + constexpr size_t dataSize = 1024; + + int a[dataSize], b[dataSize], r[dataSize]; + for (int i = 0; i < dataSize; ++i) { + a[i] = i; + b[i] = i; + r[i] = 0; + } + + try { + auto defaultQueue = sycl::queue{}; + + auto bufA = sycl::buffer{a, sycl::range{dataSize}}; + auto bufB = sycl::buffer{b, sycl::range{dataSize}}; + auto bufR = sycl::buffer{r, sycl::range{dataSize}}; + + defaultQueue.submit([&](sycl::handler& cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + sycl::accessor accR{bufR, cgh, sycl::write_only}; + + cgh.parallel_for( + sycl::range{dataSize}, [=](sycl::id<1> globalId) { + accR[globalId] = accA[globalId] + accB[globalId]; + }); + }); + + defaultQueue.throw_asynchronous(); + } catch (const sycl::exception& e) { + std::cout << "Exception caught: " << e.what() << std::endl; + } + + SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); +} + +void test_nd_range() { constexpr size_t dataSize = 1024; + constexpr size_t workGroupSize = 128; - float a[dataSize], b[dataSize], r[dataSize]; + int a[dataSize], b[dataSize], r[dataSize]; for (int i = 0; i < dataSize; ++i) { - a[i] = static_cast(i); - b[i] = static_cast(i); - r[i] = 0.0f; + a[i] = i; + b[i] = i; + r[i] = 0; } try { @@ -31,22 +69,29 @@ int main() { auto bufB = sycl::buffer{b, sycl::range{dataSize}}; auto bufR = sycl::buffer{r, sycl::range{dataSize}}; - defaultQueue - .submit([&](sycl::handler& cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_only}; - sycl::accessor accB{bufB, cgh, sycl::read_only}; - sycl::accessor accR{bufR, cgh, sycl::write_only}; + defaultQueue.submit([&](sycl::handler& cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + sycl::accessor accR{bufR, cgh, sycl::write_only}; + + auto ndRange = + sycl::nd_range{sycl::range{dataSize}, sycl::range{workGroupSize}}; - cgh.parallel_for( - sycl::range{dataSize}, - [=](sycl::id<1> idx) { accR[idx] = accA[idx] + accB[idx]; }); - }) - .wait(); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> itm) { + sycl::id globalId = itm.get_global_id(); + accR[globalId] = accA[globalId] + accB[globalId]; + }); + }); defaultQueue.throw_asynchronous(); } catch (const sycl::exception& e) { std::cout << "Exception caught: " << e.what() << std::endl; } - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2.0f; }); + SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); +} + +int main() { + test_range(); + test_nd_range(); } diff --git a/Code_Exercises/Data_Parallelism/source.cpp b/Code_Exercises/Data_Parallelism/source.cpp index 323e7527..b5a3efae 100644 --- a/Code_Exercises/Data_Parallelism/source.cpp +++ b/Code_Exercises/Data_Parallelism/source.cpp @@ 
-44,11 +44,11 @@ int main() { constexpr size_t dataSize = 1024; - float a[dataSize], b[dataSize], r[dataSize]; + int a[dataSize], b[dataSize], r[dataSize]; for (int i = 0; i < dataSize; ++i) { - a[i] = static_cast(i); - b[i] = static_cast(i); - r[i] = 0.0f; + a[i] = i; + b[i] = i; + r[i] = 0; } // Task: Compute r[i] = a[i] + b[i] in parallel on the SYCL device @@ -56,5 +56,5 @@ int main() { r[i] = a[i] + b[i]; } - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2.0f; }); + SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); } diff --git a/Code_Exercises/ND_Range_Kernel/CMakeLists.txt b/Code_Exercises/ND_Range_Kernel/CMakeLists.txt deleted file mode 100644 index e9019b2f..00000000 --- a/Code_Exercises/ND_Range_Kernel/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -#[[ - SYCL Academy (c) - - SYCL Academy is licensed under a Creative Commons Attribution-ShareAlike 4.0 - International License. - - You should have received a copy of the license along with this work. If not, - see . -]] - -add_sycl_executable(ND_Range_Kernel source) -if(SYCL_ACADEMY_ENABLE_SOLUTIONS) - add_sycl_executable(ND_Range_Kernel solution) -endif() diff --git a/Code_Exercises/ND_Range_Kernel/README.md b/Code_Exercises/ND_Range_Kernel/README.md deleted file mode 100644 index e11a73be..00000000 --- a/Code_Exercises/ND_Range_Kernel/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# SYCL Academy - -## ND Range Kernel ---- - -In this exercise you will learn how to enqueue ND range kernel functions. - ---- - -### 1.) Use items in parallel_for - -Using the application from any exercise so far or creating a new one, enqueue a -kernel function using the `parallel_for` variant which takes a `range` but has -the kernel function take an `item`. - -Feel free to use either the buffer/accessor model and feel free to use any -method of synchronization and copy back. - -When using an `item` you cannot pass this directly to the subscript operator of -an `accessor` you have to retrieve the `id`, you can do this by calling the -`get_id` member function. - -### 2.) Enqueue an ND range kernel - -Using the application from any exercise so far or creating a new one, enqueue an -ND range kernel function using the `parallel_for` variant which takes an -`nd_range`. - -Remember an `nd_range` is made up of two `range`s, the first being the global -range and the second being the local range or the work-group size. - -Remember that when using this variant of `parallel_for` the kernel function -takes an `nd_item`. - -Similarly to to the `item` when using the `nd_item` you cannot pass this -directly to the subscript operator of an `accessor`, you can retrieve the `id` -by calling the `get_global_id` member function. - -Feel free to use either the buffer/accessor model and feel free to use any -method of synchronization and copy back. - -## Build and execution hints - -For DevCloud via JupiterLab follow these [instructions](../devcloudJupyter.md). - -For DPC++: [instructions](../dpcpp.md). - -For AdaptiveCpp: [instructions](../adaptivecpp.md). diff --git a/Code_Exercises/ND_Range_Kernel/solution.cpp b/Code_Exercises/ND_Range_Kernel/solution.cpp deleted file mode 100644 index 24f1c746..00000000 --- a/Code_Exercises/ND_Range_Kernel/solution.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - SYCL Academy (c) - - SYCL Academy is licensed under a Creative Commons - Attribution-ShareAlike 4.0 International License. - - You should have received a copy of the license along with this - work. If not, see . 
-*/ - -#include - -#include "../helpers.hpp" - -class vector_add_1; -class vector_add_2; - -void test_item() { - constexpr size_t dataSize = 1024; - - int a[dataSize], b[dataSize], r[dataSize]; - for (int i = 0; i < dataSize; ++i) { - a[i] = i; - b[i] = i; - r[i] = 0; - } - - try { - auto gpuQueue = sycl::queue{sycl::gpu_selector_v}; - - auto bufA = sycl::buffer{a, sycl::range{dataSize}}; - auto bufB = sycl::buffer{b, sycl::range{dataSize}}; - auto bufR = sycl::buffer{r, sycl::range{dataSize}}; - - gpuQueue.submit([&](sycl::handler& cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_only}; - sycl::accessor accB{bufB, cgh, sycl::read_only}; - sycl::accessor accR{bufR, cgh, sycl::write_only}; - - cgh.parallel_for( - sycl::range{dataSize}, [=](sycl::item<1> itm) { - auto globalId = itm.get_id(); - accR[globalId] = accA[globalId] + accB[globalId]; - }); - }); - - gpuQueue.throw_asynchronous(); - } catch (const sycl::exception& e) { - std::cout << "Exception caught: " << e.what() << std::endl; - } - - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); -} - -void test_nd_item() { - constexpr size_t dataSize = 1024; - constexpr size_t workGroupSize = 128; - - int a[dataSize], b[dataSize], r[dataSize]; - for (int i = 0; i < dataSize; ++i) { - a[i] = i; - b[i] = i; - r[i] = 0; - } - - try { - auto gpuQueue = sycl::queue{sycl::gpu_selector_v}; - - auto bufA = sycl::buffer{a, sycl::range{dataSize}}; - auto bufB = sycl::buffer{b, sycl::range{dataSize}}; - auto bufR = sycl::buffer{r, sycl::range{dataSize}}; - - gpuQueue.submit([&](sycl::handler& cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - sycl::accessor accB{bufB, cgh, sycl::read_write}; - sycl::accessor accR{bufR, cgh, sycl::read_write}; - - auto ndRange = - sycl::nd_range{sycl::range{dataSize}, sycl::range{workGroupSize}}; - - cgh.parallel_for(ndRange, [=](sycl::nd_item<1> itm) { - auto globalId = itm.get_global_id(); - accR[globalId] = accA[globalId] + accB[globalId]; - }); - }); - - gpuQueue.throw_asynchronous(); - } catch (const sycl::exception& e) { - std::cout << "Exception caught: " << e.what() << std::endl; - } - - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); -} - -int main() { - test_item(); - test_nd_item(); -} diff --git a/Code_Exercises/ND_Range_Kernel/source.cpp b/Code_Exercises/ND_Range_Kernel/source.cpp deleted file mode 100644 index 7ce162ea..00000000 --- a/Code_Exercises/ND_Range_Kernel/source.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - SYCL Academy (c) - - SYCL Academy is licensed under a Creative Commons - Attribution-ShareAlike 4.0 International License. - - You should have received a copy of the license along with this - work. If not, see . - - * SYCL Quick Reference - * ~~~~~~~~~~~~~~~~~~~~ - * - * // Default construct a queue - * auto q = sycl::queue{}; - * - * // Construct an in-order queue - * auto q = sycl::queue{sycl::default_selector_v, - * {sycl::property::queue::in_order{}}}; - * - * // Declare a buffer pointing to ptr - * auto buf = sycl::buffer{ptr, sycl::range{n}}; - * - * // Do a USM memcpy - * auto event = q.memcpy(dst_ptr, src_ptr, sizeof(T)*n); - * // Do a USM memcpy with dependent events - * auto event = q.memcpy(dst_ptr, src_ptr, sizeof(T)*n, {event1, event2}); - * - * // Wait on an event - * event.wait(); - * - * // Wait on a queue - * q.wait(); - * - * // Submit work to the queue - * auto event = q.submit([&](sycl::handler &cgh) { - * // COMMAND GROUP - * }); - * - * - * // Within the command group you can - * // 1. 
Declare an accessor to a buffer - * auto read_write_acc = sycl::accessor{buf, cgh}; - * auto read_acc = sycl::accessor{buf, cgh, sycl::read_only}; - * auto write_acc = sycl::accessor{buf, cgh, sycl::write_only}; - * auto no_init_acc = sycl::accessor{buf, cgh, sycl::no_init}; - * // 2. Enqueue a parallel for: - * // i: With range: - * cgh.parallel_for(sycl::range{n}, - * [=](sycl::id<1> i) { // Do something }); - * // ii: With nd_range: - * cgh.parallel_for(sycl::nd_range{ - * globalRange, localRange}, [=](sycl::nd_item<1> i) { - * // Do something - * }); -*/ - -#include "../helpers.hpp" - -int main() { - constexpr size_t dataSize = 1024; - - int a[dataSize], b[dataSize], r[dataSize]; - for (int i = 0; i < dataSize; ++i) { - a[i] = i; - b[i] = i; - r[i] = 0; - } - - // Task: parallelise the vector add kernel using nd_range - for (int i = 0; i < dataSize; ++i) { - r[i] = a[i] + b[i]; - } - - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); -} diff --git a/Lesson_Materials/Data_Parallelism/index.html b/Lesson_Materials/Data_Parallelism/index.html index c1018916..2f2e571e 100644 --- a/Lesson_Materials/Data_Parallelism/index.html +++ b/Lesson_Materials/Data_Parallelism/index.html @@ -153,6 +153,40 @@ + +
+
+ #### SYCL execution model +
+
+
+ * SYCL kernel functions are invoked within an **nd-range**
+ * An nd-range contains a number of work-groups, each made up of a number of work-items
+ * Work-groups always have the same number of work-items
+
+ ![ND-Range](../common-revealjs/images/ndrange.png "ND-Range") +
+
+
+ +
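As a rough sketch of the two components described above (the sizes here are illustrative, not taken from the lesson), an `nd_range` pairs a global range with a local range, and the number of work-groups follows from dividing one by the other:

```cpp
#include <sycl/sycl.hpp>

int main() {
  // Global range of 1024 work-items split into work-groups of 64 work-items:
  // this nd-range therefore contains 1024 / 64 = 16 work-groups.
  auto ndr = sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{64}};

  // The global range should be divisible by the local range in each dimension.
  return ndr.get_group_range()[0] == 16 ? 0 : 1;
}
```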
+
+ #### SYCL execution model +
+
+
+ * The nd-range describes an **iteration space**: how it is composed in terms of work-groups and work-items
+ * An nd-range can have 1, 2 or 3 dimensions
+ * An nd-range has two components
+ * The **global-range** describes the total number of work-items in each dimension
+ * The **local-range** describes the number of work-items in a work-group in each dimension
+
+ ![ND-Range](../common-revealjs/images/ndrange-example.png "ND-Range") +
+
+
@@ -160,11 +194,100 @@
- * Work-items are launched in parallel in a `sycl::range`.
- * In order to maximize parallelism, the range should correspond to the problem size.
+ * Each invocation in the iteration space of an nd-range is a work-item
+ * Each work-item knows its position in the nd-range and can query information about it
+ * For example, a work-item in the nd-range pictured here has the following:
+ * **Global range**: {12, 12}
+ * **Global id**: {5, 6}
+ * **Group range**: {3, 3}
+ * **Group id**: {1, 1}
+ * **Local range**: {4, 4}
+ * **Local id**: {1, 2}
+
+ ![ND-Range](../common-revealjs/images/ndrange-example-work-item.png "ND-Range") +
+
+
+ +
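A minimal sketch of how a kernel can query those values through its `nd_item` (the kernel name and queue setup are illustrative assumptions; the values in the comments mirror the example work-item above):

```cpp
#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  q.submit([&](sycl::handler& cgh) {
     cgh.parallel_for<class position_queries>(
         sycl::nd_range<2>{{12, 12}, {4, 4}}, [=](sycl::nd_item<2> it) {
           // Values shown for the example work-item; unused here, the point
           // is the set of queries available on an nd_item.
           sycl::range<2> globalRange = it.get_global_range();   // {12, 12}
           sycl::id<2> globalId = it.get_global_id();            // e.g. {5, 6}
           sycl::range<2> groupRange = it.get_group_range();     // {3, 3}
           sycl::id<2> groupId = it.get_group().get_group_id();  // e.g. {1, 1}
           sycl::range<2> localRange = it.get_local_range();     // {4, 4}
           sycl::id<2> localId = it.get_local_id();              // e.g. {1, 2}
         });
   }).wait();
}
```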
+
+ #### SYCL execution model +
+
+
+ Typically, in an nd-range invocation, SYCL will execute the SYCL kernel function on a very large number of work-items, often in the thousands
+
+ ![ND-Range](../common-revealjs/images/ndrange-invocation.png "ND-Range") +
+
+
+ +
+
+ #### SYCL execution model +
+
+
+ * Multiple work-items will generally execute concurrently
+ * On vector hardware this is often done in lock-step, which means they execute the same hardware instructions
+ * The number of work-items that will execute concurrently can vary from one device to another
+ * Work-items will be batched along with other work-items in the same work-group
+ * The order in which work-items and work-groups are executed is implementation-defined
+
+ ![ND-Range](../common-revealjs/images/ndrange-lock-step.png "ND-Range") +
+
+
+ +
+
+ #### SYCL execution model +
+
+
+ * Work-items in a work-group can be synchronized using a work-group barrier + * All work-items within a work-group must reach the barrier before any can continue on +
+
+ ![ND-Range](../common-revealjs/images/work-group-0.png "ND-Range") +
+
+
+ +
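A sketch of the work-group barrier described above, using work-group local memory so that the synchronisation actually matters (the kernel name and sizes are made up for illustration; `sycl::local_accessor` and `sycl::group_barrier` are the SYCL 2020 spellings):

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  constexpr size_t dataSize = 1024;
  constexpr size_t workGroupSize = 128;
  std::vector<int> out(dataSize, 0);
  {
    auto buf = sycl::buffer{out.data(), sycl::range{dataSize}};
    sycl::queue q;

    q.submit([&](sycl::handler& cgh) {
      sycl::accessor acc{buf, cgh, sycl::write_only};
      // One element of work-group local memory per work-item in the group.
      sycl::local_accessor<int> scratch{sycl::range{workGroupSize}, cgh};

      cgh.parallel_for<class barrier_example>(
          sycl::nd_range{sycl::range{dataSize}, sycl::range{workGroupSize}},
          [=](sycl::nd_item<1> it) {
            auto localId = it.get_local_id(0);
            scratch[localId] = static_cast<int>(it.get_global_id(0));

            // Every work-item in the group must reach this barrier before
            // any may continue, so the read below sees the writes above.
            sycl::group_barrier(it.get_group());

            // Safe: read a value written by a *different* work-item.
            acc[it.get_global_id()] = scratch[workGroupSize - 1 - localId];
          });
    });
  }  // the buffer destructor waits for the kernel and copies back to 'out'
}
```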
+
+ #### SYCL execution model +
+
+
+ * SYCL does not support synchronizing across all work-items in the nd-range + * The only way to do this is to split the computation into separate SYCL kernel functions
- ![Work-Group](../common-revealjs/images/SYCL_range.png "Work-Group") + ![ND-Range](../common-revealjs/images/work-group-0-1.png "ND-Range") +
+
+
+ +
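A sketch of that work-around (buffer names and sizes are invented for illustration): the second kernel can safely read elements written by any work-item of the first, because the runtime orders the two submissions through their accessor dependencies.

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  constexpr size_t dataSize = 1024;
  std::vector<int> in(dataSize, 0), out(dataSize, 0);
  {
    auto bufIn = sycl::buffer{in.data(), sycl::range{dataSize}};
    auto bufOut = sycl::buffer{out.data(), sycl::range{dataSize}};
    sycl::queue q;

    // Phase 1: every work-item writes its own element.
    q.submit([&](sycl::handler& cgh) {
      sycl::accessor acc{bufIn, cgh, sycl::write_only};
      cgh.parallel_for<class phase_one>(
          sycl::range{dataSize},
          [=](sycl::id<1> i) { acc[i] = static_cast<int>(i[0]); });
    });

    // Phase 2: reads an element written by a *different* work-item. The
    // accessor dependency guarantees phase 1 has finished for the whole
    // nd-range before phase 2 starts, which no in-kernel barrier could do.
    q.submit([&](sycl::handler& cgh) {
      sycl::accessor accIn{bufIn, cgh, sycl::read_only};
      sycl::accessor accOut{bufOut, cgh, sycl::write_only};
      cgh.parallel_for<class phase_two>(
          sycl::range{dataSize}, [=](sycl::id<1> i) {
            accOut[i] = accIn[(i[0] + 1) % dataSize];
          });
    });
  }  // buffer destructors wait for both kernels and copy back
}
```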
+
+ #### SYCL execution model +
+
+
+ * SYCL also provides a simplified execution model with `sycl::range` in place of `sycl::nd_range`
+ * The caller only provides the global range
+ * The local range is decided by the runtime and cannot be inspected
+ * No synchronization is possible between work-items
+ * Useful for simple problems which don't require synchronization, local memory or ultimate performance
+ * The runtime may not always have enough information to choose the best-performing work-group size
+
+ ND-Range
+ SYCL-Range
@@ -177,8 +300,8 @@
-cgh.parallel_for<my_kernel>(range{64, 64}, 
-                          [=](id<2> idx){
+cgh.parallel_for<my_kernel>(nd_range{{1024, 16}, {32, 4}},
+                          [=](nd_item<2> item){
   // SYCL kernel function is executed 
   // on a range of work-items
 });
@@ -187,74 +310,207 @@
 					
* In SYCL, kernel functions can be enqueued to execute - over a range of work-items using `parallel_for`. - * When using `parallel_for` you must also pass `range` - which describes the iteration space over which the kernel - is to be executed. + over a range of work-items using `parallel_for` + * The first argument to `parallel_for` is an `nd_range` or + a `range` which describes the iteration space over which + the kernel is to be executed + * The kernel function has to take an `nd_item` or `item`, + respectively, as the parameter (or any type they can be + implicitly converted to, commonly from `item` to `id`)
- +
- #### Parallel_for + #### Expressing parallelism
-
-
-cgh.parallel_for<my_kernel>(range{64, 64}, 
-                          [=](id<2> idx){
-  // SYCL kernel function is executed 
-  // on a range of work-items
+						
+
+
+cgh.parallel_for<kernel>(nd_range<1>{1024, 32},
+  [=](nd_item<1> ndItem){
+    /* kernel function code */
+    id globalId = ndItem.get_global_id();
+    id localId = ndItem.get_local_id();
 });
-							
+
+
+
+ * Overload taking an `nd_range` object specifies the global and local range + * An `nd_item` parameter represents the global and local range and index +
+
+
+cgh.parallel_for<kernel>(range<1>{1024},
+  [=](item<1> item){
+    /* kernel function code */
+    id globalId = item.get_id();
+});
+								
+
+
+ * Overload taking a `range` object specifies the global range, runtime decides local range + * An `item` parameter represents the global range and the index within the global range +
+
+
+cgh.parallel_for<kernel>(range<1>{1024},
+  [=](id<1> globalId){
+    /* kernel function code */
+});
+								
+
+
+ * Overload taking a `range` object specifies the global range, runtime decides local range + * An `id` parameter represents the index within the global range +
-
- * When using `parallel_for` you must also have the - function object which represents the kernel function take - an `id`. - * This represents the current work-item being executed and - its position within the iteration space. + + +
+
+ #### SYCL memory model +
+
+
+ * Each work-item can access a dedicated region of **private memory** + * A work-item cannot access the private memory of another work-item +
+
+ ![Private Memory](../common-revealjs/images/workitem-privatememory.png "Private Memory") +
+ +
+
+ +
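For example (a sketch with an illustrative buffer, queue and kernel name), any ordinary variable declared inside a kernel lives in the executing work-item's private memory:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  constexpr size_t dataSize = 1024;
  std::vector<int> out(dataSize, 0);
  {
    auto buf = sycl::buffer{out.data(), sycl::range{dataSize}};
    sycl::queue q;
    q.submit([&](sycl::handler& cgh) {
      sycl::accessor acc{buf, cgh, sycl::write_only};
      cgh.parallel_for<class private_memory_example>(
          sycl::range{dataSize}, [=](sycl::id<1> i) {
            // 'temp' is an ordinary variable declared inside the kernel: it
            // lives in this work-item's private memory and no other
            // work-item can see or modify it.
            int temp = static_cast<int>(i[0]) * 2;
            acc[i] = temp;
          });
    });
  }
}
```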
+
+ #### SYCL memory model +
+
+
+ ![Local Memory](../common-revealjs/images/workitem-localmemory.png "Local Memory") +
+
+ * Each work-item can access a dedicated region of **local memory** accessible to all work-items in a work-group + * A work-item cannot access the local memory of another work-group +
- #### Expressing parallelism + #### SYCL memory model
-
-
-							
-cgh.parallel_for<kernel>(range<1>(1024), 
-  [=](id<1> idx){
-    /* kernel function code */
-});
-							
+
+ ![Global Memory](../common-revealjs/images/workitem-constantmemory.png "Global Memory") +
+
+ * Each work-item can access a single region of **global memory** that's accessible to all work-items in an nd-range
+ +
+
+ +
+
+ #### SYCL memory model +
+
+
+ * Each memory region has a different size and access latency + * Global memory is larger than local memory and local memory is larger than private memory + * Private memory is faster than local memory and local memory is faster than global memory +
+
+ ![Memory Regions](../common-revealjs/images/memory-regions.png "Memory Regions") +
+
+
+ +
+
+ #### Accessing Data With Accessors +
+
+ * There are a few different ways to access the data represented by an accessor
+ * The subscript operator can take an **id**
+ * It must have the same dimensionality as the accessor
+ * For dimensions > 1, the linear address is calculated in row-major order
+ * Nested subscript operators can be called for each dimension, each taking a **size_t**
+ * E.g. a 3-dimensional accessor: acc[x][y][z] = …
+ * A pointer to memory can be retrieved by calling **get_pointer**
+ * This returns a raw pointer to the data
+
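The nested-subscript form listed above does not appear in the following examples, so here is a brief sketch (the 3-D buffer, its sizes and the `gpuQueue` are illustrative, echoing the neighbouring slides):

```cpp
auto buf3d = sycl::buffer<float, 3>{sycl::range<3>{4, 8, 16}};

gpuQueue.submit([&](sycl::handler& cgh) {
  sycl::accessor acc{buf3d, cgh, sycl::write_only};
  cgh.parallel_for<class nested_subscript>(
      sycl::range<3>{4, 8, 16}, [=](sycl::id<3> i) {
        // One size_t subscript per dimension; equivalent to acc[i],
        // with the linear address computed in row-major order.
        acc[i[0]][i[1]][i[2]] = 0.0f;
      });
});
```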
+ +
+
+ #### Accessing Data With Accessors +
+
+
-							
-cgh.parallel_for<kernel>(range<1>(1024), 
-  [=](item<1> item){
-    /* kernel function code */
+buffer<float, 1> bufA(dA.data(), range<1>(dA.size()));
+buffer<float, 1> bufB(dB.data(), range<1>(dB.size()));
+buffer<float, 1> bufO(dO.data(), range<1>(dO.size()));
+
+gpuQueue.submit([&](handler &cgh){
+  sycl::accessor inA{bufA, cgh, sycl::read_only};
+  sycl::accessor inB{bufB, cgh, sycl::read_only};
+  sycl::accessor out{bufO, cgh, sycl::write_only};
+  cgh.parallel_for<add>(range<1>(dA.size()),
+    [=](id<1> i){
+    out[i] = inA[i] + inB[i];
+  });
 });
 							
+
+
+ * Here we access the data of the `accessor` by + passing in the `id` passed to the SYCL kernel + function. +
+
+
+ +
+
+ #### Accessing Data With Accessors +
+
+
-							
-cgh.parallel_for<kernel>(nd_range<1>(range<1>(1024), 
-  range<1>(32)),[=](nd_item<1> ndItem){
-    /* kernel function code */
+buffer<float, 1> bufA(dA.data(), range<1>(dA.size()));
+buffer<float, 1> bufB(dB.data(), range<1>(dB.size()));
+buffer<float, 1> bufO(dO.data(), range<1>(dO.size()));
+
+gpuQueue.submit([&](handler &cgh){
+  sycl::accessor inA{bufA, cgh, sycl::read_only};
+  sycl::accessor inB{bufB, cgh, sycl::read_only};
+  sycl::accessor out{bufO, cgh, sycl::write_only};
+  cgh.parallel_for<add>(rng, [=](item<3> i){
+    auto ptrA = inA.get_pointer();
+    auto ptrB = inB.get_pointer();
+    auto ptrO = out.get_pointer();
+    auto linearId = i.get_linear_id();
+
+    ptrO[linearId] = ptrA[linearId] + ptrB[linearId];
+  });
 });
 							
-
- * Overload taking a **range** object specifies the global range, runtime decides local range - * An **id** parameter represents the index within the global range - ____________________________________________________________________________________________ - * Overload taking a **range** object specifies the global range, runtime decides local range - * An **item** parameter represents the global range and the index within the global range - ____________________________________________________________________________________________ - * Overload taking an **nd_range** object specifies the global and local range - * An **nd_item** parameter represents the global and local range and index +
+ * Here we retrieve the underlying pointer for each + of the `accessor`s. + * We then access the pointer using the linearized + `id` by calling the `get_linear_id` member function + on the `item`. + * Again this linearization is calculated in + row-major order.
@@ -271,8 +527,11 @@ Code_Exercises/Data_Parallelism/source.cpp
- Implement a SYCL application that adds two arrays of - values together in parallel using `parallel_for`. + Implement a SYCL application using `parallel_for` to add two arrays of values +
+
+ * Use buffers and accessors to manage data + * Try the `sycl::range` and `sycl::nd_range` variants
diff --git a/Lesson_Materials/ND_Range_Kernel/index.html b/Lesson_Materials/ND_Range_Kernel/index.html deleted file mode 100644 index 2c50d585..00000000 --- a/Lesson_Materials/ND_Range_Kernel/index.html +++ /dev/null @@ -1,431 +0,0 @@ - - - - - - - - - - - - - -
-
-
- - - - -
- -
- ## ND Range Kernels -
- -
- ## Learning Objectives - * Learn about the SYCL execution and memory model - * Learn how to enqueue an nd-range kernel function -
- -
-
- #### SYCL execution model -
-
-
- * SYCL kernel functions are executed by **work-items** - * You can think of a work-item as a thread of execution - * Each work-item will execute a SYCL kernel function from start to end - * A work-item can run on CPU threads, SIMD lanes, GPU threads, or any other kind of processing element -
-
- ![Work-Item](../common-revealjs/images/workitem.png "Work-Item") -
- -
-
- -
-
- #### SYCL execution model -
-
-
- * Work-items are collected together into **work-groups** - * The size of work-groups is generally relative to what is optimal on the device being targeted - * It can also be affected by the resources used by each work-item -
-
- ![Work-Group](../common-revealjs/images/workgroup.png "Work-Group") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * SYCL kernel functions are invoked within an **nd-range** - * An nd-range has a number of work-groups and subsequently a number of work-items - * Work-groups always have the same number of work-items -
-
- ![ND-Range](../common-revealjs/images/ndrange.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * The nd-range describes an **iteration space**: how it is composed in terms of work-groups and work-items - * An nd-range can be 1, 2 or 3 dimensions - * An nd-range has two components - * The **global-range** describes the total number of work-items in each dimension - * The **local-range** describes the number of work-items in a work-group in each dimension -
-
- ![ND-Range](../common-revealjs/images/ndrange-example.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * Each invocation in the iteration space of an nd-range is a work-item - * Each invocation knows which work-item it is on and can query certain information about its position in the nd-range - * Each work-item has the following: - * **Global range**: {12, 12} - * **Global id**: {5, 6} - * **Group range**: {3, 3} - * **Group id**: {1, 1} - * **Local range**: {4, 4} - * **Local id**: {1, 2} -
-
- ![ND-Range](../common-revealjs/images/ndrange-example-work-item.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- Typically an nd-range invocation SYCL will execute the SYCL kernel function on a very large number of work-items, often in the thousands -
-
- ![ND-Range](../common-revealjs/images/ndrange-invocation.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * Multiple work-items will generally execute concurrently - * On vector hardware this is often done in lock-step, which means the same hardware instructions - * The number of work-items that will execute concurrently can vary from one device to another - * Work-items will be batched along with other work-items in the same work-group - * The order work-items and work-groups are executed in is implementation defined -
-
- ![ND-Range](../common-revealjs/images/ndrange-lock-step.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * Work-items in a work-group can be synchronized using a work-group barrier - * All work-items within a work-group must reach the barrier before any can continue on -
-
- ![ND-Range](../common-revealjs/images/work-group-0.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * SYCL does not support synchronizing across all work-items in the nd-range - * The only way to do this is to split the computation into separate SYCL kernel functions -
-
- ![ND-Range](../common-revealjs/images/work-group-0-1.png "ND-Range") -
-
-
- -
-
- #### SYCL memory model -
-
-
- * Each work-item can access a dedicated region of **private memory** - * A work-item cannot access the private memory of another work-item -
-
- ![Private Memory](../common-revealjs/images/workitem-privatememory.png "Private Memory") -
- -
-
- -
-
- #### SYCL memory model -
-
-
- ![Local Memory](../common-revealjs/images/workitem-localmemory.png "Local Memory") -
-
- * Each work-item can access a dedicated region of **local memory** accessible to all work-items in a work-group - * A work-item cannot access the local memory of another work-group -
-
-
- -
-
- #### SYCL memory model -
-
-
- ![Constant Memory](../common-revealjs/images/workitem-constantmemory.png "Constant Memory") -
-
- * Each work-item can access a single region of **global memory** that's accessible to all work-items in a ND-range - * Each work-item can also access a region of global memory reserved as **constant memory**, which is read-only -
- -
-
- -
-
- #### SYCL memory model -
-
-
- * Each memory region has a different size and access latency - * Global / constant memory is larger than local memory and local memory is larger than private memory - * Private memory is faster than local memory and local memory is faster than global / constant memory -
-
- ![Memory Regions](../common-revealjs/images/memory-regions.png "Memory Regions") -
-
-
- -
-
- #### Expressing parallelism -
-
-
-
-							
-cgh.parallel_for<kernel>(range<1>(1024), 
-  [=](id<1> idx){
-    /* kernel function code */
-});
-							
-
-							
-cgh.parallel_for<kernel>(range<1>(1024), 
-  [=](item<1> item){
-    /* kernel function code */
-});
-							
-
-							
-cgh.parallel_for<kernel>(nd_range<1>(range<1>(1024), 
-  range<1>(32)),[=](nd_item<1> ndItem){
-    /* kernel function code */
-});
-							
-
-
- * Overload taking a **range** object specifies the global range, runtime decides local range - * An **id** parameter represents the index within the global range - ____________________________________________________________________________________________ - * Overload taking a **range** object specifies the global range, runtime decides local range - * An **item** parameter represents the global range and the index within the global range - ____________________________________________________________________________________________ - * Overload taking an **nd_range** object specifies the global and local range - * An **nd_item** parameter represents the global and local range and index -
-
-
- -
-
- #### Accessing Data With Accessors -
-
- * There are a few different ways to access the data represented by an accessor - * The subscript operator can take an **id** - * Must be the same dimensionality of the accessor - * For dimensions > 1, linear address is calculated in row major - * Nested subscript operators can be called for each dimension taking a **size_t** - * E.g. a 3-dimensional accessor: acc[x][y][z] = … - * A pointer to memory can be retrieved by calling **get_pointer** - * This returns a raw pointer to the data -
-
- -
-
- #### Accessing Data With Accessors -
-
-
-
-buffer<float, 1> bufA(dA.data(), range<1>(dA.size())); 
-buffer<float, 1> bufB(dB.data(), range<1>(dB.size())); 
-buffer<float, 1> bufO(dO.data(), range<1>(dO.size()));
-
-gpuQueue.submit([&](handler &cgh){
-  sycl::accessor inA{bufA, cgh, sycl::read_only};
-  sycl::accessor inB{bufB, cgh, sycl::read_only};
-  sycl::accessor out{bufO, cgh, sycl::write_only};
-  cgh.parallel_for<add>(range<1>(dA.size()), 
-    [=](id<1> i){ 
-    out[i] = inA[i] + inB[i];
-  });
-});
-							
-
-
- * Here we access the data of the `accessor` by - passing in the `id` passed to the SYCL kernel - function. -
-
-
- -
-
- #### Accessing Data With Accessors -
-
-
-
-buffer<float, 1> bufA(dA.data(), range<1>(dA.size())); 
-buffer<float, 1> bufB(dB.data(), range<1>(dB.size())); 
-buffer<float, 1> bufO(dO.data(), range<1>(dO.size()));
-
-gpuQueue.submit([&](handler &cgh){
-  sycl::accessor inA{bufA, cgh, sycl::read_only};
-  sycl::accessor inB{bufB, cgh, sycl::read_only};
-  sycl::accessor out{bufO, cgh, sycl::write_only};
-  cgh.parallel_for<add>(rng, [=](item<3> i){
-    auto ptrA = inA.get_pointer();
-    auto ptrB = inB.get_pointer();
-    auto ptrO = out.get_pointer();
-    auto linearId = i.get_linear_id();
-
-    ptrA[linearId] = ptrB[linearId] + ptrO[linearId]; 
-  });
-});
-							
-
-
- * Here we retrieve the underlying pointer for each - of the `accessor`s. - * We then access the pointer using the linearized - `id` by calling the `get_linear_id` member function - on the `item`. - * Again this linearization is calculated in - row-major order. -
-
-
- -
- ## Questions -
- -
-
- #### Exercise -
-
- Code_Exercises/ND_Range_Kernel/source -
-
- Implement a SYCL application that will perform a vector add using `parallel_for`, adding multiple elements in parallel. -
-
-
-
- - - - - - - diff --git a/README.md b/README.md index aae8a35b..74c0776b 100644 --- a/README.md +++ b/README.md @@ -85,15 +85,14 @@ may not match completely. | 11 | In Order Queue | [slides][lesson-11-slides] | [exercise][lesson-11-exercise] | [source][lesson-11-source] | [solution][lesson-11-solution] | Yes | Yes | | 12 | Advanced Data Flow | [slides][lesson-12-slides] | [exercise][lesson-12-exercise] | [source][lesson-12-source] | [solution][lesson-12-solution] | Yes | Yes | | 13 | Multiple Devices | [slides][lesson-13-slides] | [exercise][lesson-13-exercise] | [source][lesson-13-source] | [solution][lesson-13-solution] | Yes | Yes | -| 14 | ND Range Kernels | [slides][lesson-14-slides] | [exercise][lesson-14-exercise] | [source][lesson-14-source] | [solution][lesson-14-solution] | Yes | Yes | -| 15 | Image Convolution | [slides][lesson-15-slides] | [exercise][lesson-15-exercise] | | [solution][lesson-15-solution] | Yes | Yes | -| 16 | Coalesced Global Memory | [slides][lesson-16-slides] | [exercise][lesson-16-exercise] | [source][lesson-16-source] | [solution][lesson-16-solution] | Yes | Yes | -| 17 | Vectors | [slides][lesson-17-slides] | [exercise][lesson-17-exercise] | [source][lesson-17-source] | [solution][lesson-17-solution] | Yes | Yes | -| 18 | Local Memory Tiling | [slides][lesson-18-slides] | [exercise][lesson-18-exercise] | [source][lesson-18-source] | [solution][lesson-18-solution] | Yes | Yes | -| 19 | Further Optimisations | [slides][lesson-19-slides] | [exercise][lesson-19-exercise] | [source][lesson-19-source] | [solution][lesson-19-solution] | Yes | Yes | -| 20 | Matrix Transpose | [slides][lesson-20-slides] | [exercise][lesson-20-exercise] | [source][lesson-20-source] | [solution][lesson-20-solution] | Yes | Yes | -| 21 | More SYCL Features | [slides][lesson-21-slides] | [exercise][lesson-21-exercise] | [source][lesson-21-source] | [solution][lesson-21-solution] | Yes | Yes | -| 22 | Functors | [slides][lesson-22-slides] | [exercise][lesson-22-exercise] | [source][lesson-22-source] | [solution][lesson-22-solution] | Yes | Yes | +| 14 | Image Convolution | [slides][lesson-14-slides] | [exercise][lesson-14-exercise] | [source][lesson-14-source] | [solution][lesson-14-solution] | Yes | Yes | +| 15 | Coalesced Global Memory | [slides][lesson-15-slides] | [exercise][lesson-15-exercise] | | [solution][lesson-15-solution] | Yes | Yes | +| 16 | Vectors | [slides][lesson-16-slides] | [exercise][lesson-16-exercise] | [source][lesson-16-source] | [solution][lesson-16-solution] | Yes | Yes | +| 17 | Local Memory Tiling | [slides][lesson-17-slides] | [exercise][lesson-17-exercise] | [source][lesson-17-source] | [solution][lesson-17-solution] | Yes | Yes | +| 18 | Further Optimisations | [slides][lesson-18-slides] | [exercise][lesson-18-exercise] | [source][lesson-18-source] | [solution][lesson-18-solution] | Yes | Yes | +| 19 | Matrix Transpose | [slides][lesson-19-slides] | [exercise][lesson-19-exercise] | [source][lesson-19-source] | [solution][lesson-19-solution] | Yes | Yes | +| 20 | More SYCL Features | [slides][lesson-20-slides] | [exercise][lesson-20-exercise] | [source][lesson-20-source] | [solution][lesson-20-solution] | Yes | Yes | +| 21 | Functors | [slides][lesson-21-slides] | [exercise][lesson-21-exercise] | [source][lesson-21-source] | [solution][lesson-21-solution] | Yes | Yes | ## Building the Exercises @@ -418,47 +417,42 @@ cmake ../ "-GUnix Makefiles" -DSYCL_ACADEMY_USE_DPCPP=ON -DSYCL_ACADEMY_ENABLE_S [lesson-13-source]: 
./Code_Exercises/Multiple_Devices/source.cpp [lesson-13-solution]: ./Code_Exercises/Multiple_Devices/solution.cpp -[lesson-14-slides]: ./Lesson_Materials/ND_Range_Kernel/ -[lesson-14-exercise]: ./Code_Exercises/ND_Range_Kernel/README.md -[lesson-14-source]: ./Code_Exercises/ND_Range_Kernel/source.cpp -[lesson-14-solution]: ./Code_Exercises/ND_Range_Kernel/solution.cpp - -[lesson-15-slides]: ./Lesson_Materials/Image_Convolution/ -[lesson-15-exercise]: ./Code_Exercises/Image_Convolution/README.md -[lesson-15-source]: ./Code_Exercises/Image_Convolution/source.cpp -[lesson-15-solution]: ./Code_Exercises/Image_Convolution/reference.cpp - -[lesson-16-slides]: ./Lesson_Materials/Coalesced_Global_Memory/ -[lesson-16-exercise]: ./Code_Exercises/Coalesced_Global_Memory/README.md -[lesson-16-source]: ./Code_Exercises/Coalesced_Global_Memory/source.cpp -[lesson-16-solution]: ./Code_Exercises/Coalesced_Global_Memory/solution.cpp - -[lesson-17-slides]: ./Lesson_Materials/Vectors/ -[lesson-17-exercise]: ./Code_Exercises/Vectors/README.md -[lesson-17-source]: ./Code_Exercises/Vectors/source.cpp -[lesson-17-solution]: ./Code_Exercises/Vectors/solution.cpp - -[lesson-18-slides]: ./Lesson_Materials/Local_Memory_Tiling/ -[lesson-18-exercise]: ./Code_Exercises/Local_Memory_Tiling/README.md -[lesson-18-source]: ./Code_Exercises/Local_Memory_Tiling/source.cpp -[lesson-18-solution]: ./Code_Exercises/Local_Memory_Tiling/solution.cpp - -[lesson-19-slides]: ./Lesson_Materials/Work_Group_Sizes/ -[lesson-19-exercise]: ./Code_Exercises/Work_Group_Sizes/README.md -[lesson-19-source]: ./Code_Exercises/Work_Group_Sizes/source.cpp -[lesson-19-solution]: ./Code_Exercises/Work_Group_Sizes/solution.cpp - -[lesson-20-slides]: ./Lesson_Materials/Matrix_Transpose/ -[lesson-20-exercise]: ./Code_Exercises/Matrix_Transpose/README.md -[lesson-20-source]: ./Code_Exercises/Matrix_Transpose/source.cpp -[lesson-20-solution]: ./Code_Exercises/Matrix_Transpose/solution.cpp - -[lesson-21-slides]: ./Lesson_Materials/More_SYCL_Features/ -[lesson-21-exercise]: ./Code_Exercises/More_SYCL_Features/README.md -[lesson-21-source]: ./Code_Exercises/More_SYCL_Features/source.cpp -[lesson-21-solution]: ./Code_Exercises/More_SYCL_Features/solution.cpp - -[lesson-22-slides]: ./Lesson_Materials/Fast_Track/ -[lesson-22-exercise]: ./Code_Exercises/Functors/README.md -[lesson-22-source]: ./Code_Exercises/Functors/source.cpp -[lesson-22-solution]: ./Code_Exercises/Functors/solution.cpp +[lesson-14-slides]: ./Lesson_Materials/Image_Convolution/ +[lesson-14-exercise]: ./Code_Exercises/Image_Convolution/README.md +[lesson-14-source]: ./Code_Exercises/Image_Convolution/source.cpp +[lesson-14-solution]: ./Code_Exercises/Image_Convolution/reference.cpp + +[lesson-15-slides]: ./Lesson_Materials/Coalesced_Global_Memory/ +[lesson-15-exercise]: ./Code_Exercises/Coalesced_Global_Memory/README.md +[lesson-15-source]: ./Code_Exercises/Coalesced_Global_Memory/source.cpp +[lesson-15-solution]: ./Code_Exercises/Coalesced_Global_Memory/solution.cpp + +[lesson-16-slides]: ./Lesson_Materials/Vectors/ +[lesson-16-exercise]: ./Code_Exercises/Vectors/README.md +[lesson-16-source]: ./Code_Exercises/Vectors/source.cpp +[lesson-16-solution]: ./Code_Exercises/Vectors/solution.cpp + +[lesson-17-slides]: ./Lesson_Materials/Local_Memory_Tiling/ +[lesson-17-exercise]: ./Code_Exercises/Local_Memory_Tiling/README.md +[lesson-17-source]: ./Code_Exercises/Local_Memory_Tiling/source.cpp +[lesson-17-solution]: ./Code_Exercises/Local_Memory_Tiling/solution.cpp + +[lesson-18-slides]: 
./Lesson_Materials/Work_Group_Sizes/ +[lesson-18-exercise]: ./Code_Exercises/Work_Group_Sizes/README.md +[lesson-18-source]: ./Code_Exercises/Work_Group_Sizes/source.cpp +[lesson-18-solution]: ./Code_Exercises/Work_Group_Sizes/solution.cpp + +[lesson-19-slides]: ./Lesson_Materials/Matrix_Transpose/ +[lesson-19-exercise]: ./Code_Exercises/Matrix_Transpose/README.md +[lesson-19-source]: ./Code_Exercises/Matrix_Transpose/source.cpp +[lesson-19-solution]: ./Code_Exercises/Matrix_Transpose/solution.cpp + +[lesson-20-slides]: ./Lesson_Materials/More_SYCL_Features/ +[lesson-20-exercise]: ./Code_Exercises/More_SYCL_Features/README.md +[lesson-20-source]: ./Code_Exercises/More_SYCL_Features/source.cpp +[lesson-20-solution]: ./Code_Exercises/More_SYCL_Features/solution.cpp + +[lesson-21-slides]: ./Lesson_Materials/Fast_Track/ +[lesson-21-exercise]: ./Code_Exercises/Functors/README.md +[lesson-21-source]: ./Code_Exercises/Functors/source.cpp +[lesson-21-solution]: ./Code_Exercises/Functors/solution.cpp