Merge the ND-Range Kernel lesson into Data Parallelism (#388)

The Data Parallelism and ND-Range Kernel lessons duplicated a lot of material, which became confusing when we tried presenting both in a workshop. The duplication is unnecessary and also adds to the maintenance cost. Merge the "ND-Range Kernel" lesson into "Data Parallelism" by moving all the unique slides from the latter into the former and reordering so it has a reasonable flow. The exercises were also similar, where "Data Parallelism" tested sycl::range + sycl::id version and ND-Range tested sycl::range + sycl::item as well as sycl::nd_range + sycl::nd_item. Merge the two by using most of the code from the latter, but using sycl::id instead of sycl::item. Merge the README instructions from both to retain full detail. The top-level README is adjusted to renumber all lessons after the removed one. Constant memory is deprecated in SYCL 2020, so removed it from the slides about the SYCL memory model.
codeplaysoftware · Jan 15, 2025 · f1091a6 · f1091a6
1 parent bc79770
commit f1091a6
Show file tree

Hide file tree

Showing 11 changed files with 435 additions and 797 deletions.
diff --git a/Code_Exercises/CMakeLists.txt b/Code_Exercises/CMakeLists.txt
@@ -58,7 +58,6 @@ add_subdirectory(Data_and_Dependencies)
 add_subdirectory(In_Order_Queue)
 add_subdirectory(Advanced_Data_Flow)
 add_subdirectory(Multiple_Devices)
-add_subdirectory(ND_Range_Kernel)
 add_subdirectory(Image_Convolution)
 add_subdirectory(Coalesced_Global_Memory)
 add_subdirectory(Vectors)

diff --git a/Code_Exercises/Data_Parallelism/README.md b/Code_Exercises/Data_Parallelism/README.md
@@ -27,13 +27,20 @@ Create `accessor`s to each of the `buffer`s within the command group function.
 
 Now enqueue parallel kernel function by calling `parallel_for` on the `handler`.
 
-This function takes a `range` specifying the number of iterations of the kernel
-function to invoke and the kernel function itself must take an `id` which
-represents the current iteration.
+#### 4.1 ) Use the `range` and `id` variant
+This version of `parallel_for` takes a `range` specifying the number of
+iterations of the kernel function to invoke and the kernel function itself must
+take an `id` which represents the current iteration.
 
 The `id` can be used in the `accessor` subscript operator to access or assign to
 the corresponding element of data that the accessor represents.
 
+#### 4.2 ) Use the `nd_range` and `nd_item` variant
+This version of `parallel_for` takes an `nd_range` which is made up of two
+`range`s describing the global range and the local range (work-group size). The
+kernel function must take an `nd_item`, which cannot be passed directly to the
+subscript operator of an `accessor`. Instead, retrieve the `id` using the
+`get_global_id` member function.
 
 #### Build And Execution Hints
 

diff --git a/Code_Exercises/Data_Parallelism/solution.cpp b/Code_Exercises/Data_Parallelism/solution.cpp
@@ -12,16 +12,54 @@
 
 #include "../helpers.hpp"
 
-class vector_add;
+class vector_add_1;
+class vector_add_2;
 
-int main() {
+void test_range() {
+  constexpr size_t dataSize = 1024;
+
+  int a[dataSize], b[dataSize], r[dataSize];
+  for (int i = 0; i < dataSize; ++i) {
+    a[i] = i;
+    b[i] = i;
+    r[i] = 0;
+  }
+
+  try {
+    auto defaultQueue = sycl::queue{};
+
+    auto bufA = sycl::buffer{a, sycl::range{dataSize}};
+    auto bufB = sycl::buffer{b, sycl::range{dataSize}};
+    auto bufR = sycl::buffer{r, sycl::range{dataSize}};
+
+    defaultQueue.submit([&](sycl::handler& cgh) {
+      sycl::accessor accA{bufA, cgh, sycl::read_only};
+      sycl::accessor accB{bufB, cgh, sycl::read_only};
+      sycl::accessor accR{bufR, cgh, sycl::write_only};
+
+      cgh.parallel_for<vector_add_1>(
+          sycl::range{dataSize}, [=](sycl::id<1> globalId) {
+            accR[globalId] = accA[globalId] + accB[globalId];
+          });
+    });
+
+    defaultQueue.throw_asynchronous();
+  } catch (const sycl::exception& e) {
+    std::cout << "Exception caught: " << e.what() << std::endl;
+  }
+
+  SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; });
+}
+
+void test_nd_range() {
   constexpr size_t dataSize = 1024;
+  constexpr size_t workGroupSize = 128;
 
-  float a[dataSize], b[dataSize], r[dataSize];
+  int a[dataSize], b[dataSize], r[dataSize];
   for (int i = 0; i < dataSize; ++i) {
-    a[i] = static_cast<float>(i);
-    b[i] = static_cast<float>(i);
-    r[i] = 0.0f;
+    a[i] = i;
+    b[i] = i;
+    r[i] = 0;
   }
 
   try {
@@ -31,22 +69,29 @@ int main() {
     auto bufB = sycl::buffer{b, sycl::range{dataSize}};
     auto bufR = sycl::buffer{r, sycl::range{dataSize}};
 
-    defaultQueue
-        .submit([&](sycl::handler& cgh) {
-          sycl::accessor accA{bufA, cgh, sycl::read_only};
-          sycl::accessor accB{bufB, cgh, sycl::read_only};
-          sycl::accessor accR{bufR, cgh, sycl::write_only};
+    defaultQueue.submit([&](sycl::handler& cgh) {
+      sycl::accessor accA{bufA, cgh, sycl::read_only};
+      sycl::accessor accB{bufB, cgh, sycl::read_only};
+      sycl::accessor accR{bufR, cgh, sycl::write_only};
+
+      auto ndRange =
+          sycl::nd_range{sycl::range{dataSize}, sycl::range{workGroupSize}};
 
-          cgh.parallel_for<vector_add>(
-              sycl::range{dataSize},
-              [=](sycl::id<1> idx) { accR[idx] = accA[idx] + accB[idx]; });
-        })
-        .wait();
+      cgh.parallel_for<vector_add_2>(ndRange, [=](sycl::nd_item<1> itm) {
+        sycl::id globalId = itm.get_global_id();
+        accR[globalId] = accA[globalId] + accB[globalId];
+      });
+    });
 
     defaultQueue.throw_asynchronous();
   } catch (const sycl::exception& e) {
     std::cout << "Exception caught: " << e.what() << std::endl;
   }
 
-  SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2.0f; });
+  SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; });
+}
+
+int main() {
+  test_range();
+  test_nd_range();
 }
diff --git a/Code_Exercises/Data_Parallelism/source.cpp b/Code_Exercises/Data_Parallelism/source.cpp
@@ -44,17 +44,17 @@
 int main() {
   constexpr size_t dataSize = 1024;
 
-  float a[dataSize], b[dataSize], r[dataSize];
+  int a[dataSize], b[dataSize], r[dataSize];
   for (int i = 0; i < dataSize; ++i) {
-    a[i] = static_cast<float>(i);
-    b[i] = static_cast<float>(i);
-    r[i] = 0.0f;
+    a[i] = i;
+    b[i] = i;
+    r[i] = 0;
   }
 
   // Task: Compute r[i] = a[i] + b[i] in parallel on the SYCL device
   for (int i = 0; i < dataSize; ++i) {
     r[i] = a[i] + b[i];
   }
 
-  SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2.0f; });
+  SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; });
 }
diff --git a/Code_Exercises/ND_Range_Kernel/CMakeLists.txt b/Code_Exercises/ND_Range_Kernel/CMakeLists.txt
diff --git a/Code_Exercises/ND_Range_Kernel/README.md b/Code_Exercises/ND_Range_Kernel/README.md
diff --git a/Code_Exercises/ND_Range_Kernel/solution.cpp b/Code_Exercises/ND_Range_Kernel/solution.cpp
diff --git a/Code_Exercises/ND_Range_Kernel/source.cpp b/Code_Exercises/ND_Range_Kernel/source.cpp