From f1091a68c1850e64baf7ab36903c7d7db4e9eaf0 Mon Sep 17 00:00:00 2001 From: Rafal Bielski Date: Wed, 15 Jan 2025 14:53:37 +0000 Subject: [PATCH] Merge the ND-Range Kernel lesson into Data Parallelism (#388) The Data Parallelism and ND-Range Kernel lessons duplicated a lot of material, which became confusing when we tried presenting both in a workshop. The duplication is unnecessary and also adds to the maintenance cost. Merge the "ND-Range Kernel" lesson into "Data Parallelism" by moving all the unique slides from the latter into the former and reordering so it has a reasonable flow. The exercises were also similar, where "Data Parallelism" tested sycl::range + sycl::id version and ND-Range tested sycl::range + sycl::item as well as sycl::nd_range + sycl::nd_item. Merge the two by using most of the code from the latter, but using sycl::id instead of sycl::item. Merge the README instructions from both to retain full detail. The top-level README is adjusted to renumber all lessons after the removed one. Constant memory is deprecated in SYCL 2020, so removed it from the slides about the SYCL memory model. --- Code_Exercises/CMakeLists.txt | 1 - Code_Exercises/Data_Parallelism/README.md | 13 +- Code_Exercises/Data_Parallelism/solution.cpp | 79 +++- Code_Exercises/Data_Parallelism/source.cpp | 10 +- Code_Exercises/ND_Range_Kernel/CMakeLists.txt | 14 - Code_Exercises/ND_Range_Kernel/README.md | 48 -- Code_Exercises/ND_Range_Kernel/solution.cpp | 98 ---- Code_Exercises/ND_Range_Kernel/source.cpp | 75 --- Lesson_Materials/Data_Parallelism/index.html | 363 ++++++++++++--- Lesson_Materials/ND_Range_Kernel/index.html | 431 ------------------ README.md | 100 ++-- 11 files changed, 435 insertions(+), 797 deletions(-) delete mode 100644 Code_Exercises/ND_Range_Kernel/CMakeLists.txt delete mode 100644 Code_Exercises/ND_Range_Kernel/README.md delete mode 100644 Code_Exercises/ND_Range_Kernel/solution.cpp delete mode 100644 Code_Exercises/ND_Range_Kernel/source.cpp delete mode 100644 Lesson_Materials/ND_Range_Kernel/index.html diff --git a/Code_Exercises/CMakeLists.txt b/Code_Exercises/CMakeLists.txt index 4e49060d..a0bc251c 100644 --- a/Code_Exercises/CMakeLists.txt +++ b/Code_Exercises/CMakeLists.txt @@ -58,7 +58,6 @@ add_subdirectory(Data_and_Dependencies) add_subdirectory(In_Order_Queue) add_subdirectory(Advanced_Data_Flow) add_subdirectory(Multiple_Devices) -add_subdirectory(ND_Range_Kernel) add_subdirectory(Image_Convolution) add_subdirectory(Coalesced_Global_Memory) add_subdirectory(Vectors) diff --git a/Code_Exercises/Data_Parallelism/README.md b/Code_Exercises/Data_Parallelism/README.md index fafa95ef..597ff6c9 100644 --- a/Code_Exercises/Data_Parallelism/README.md +++ b/Code_Exercises/Data_Parallelism/README.md @@ -27,13 +27,20 @@ Create `accessor`s to each of the `buffer`s within the command group function. Now enqueue parallel kernel function by calling `parallel_for` on the `handler`. -This function takes a `range` specifying the number of iterations of the kernel -function to invoke and the kernel function itself must take an `id` which -represents the current iteration. +#### 4.1 ) Use the `range` and `id` variant +This version of `parallel_for` takes a `range` specifying the number of +iterations of the kernel function to invoke and the kernel function itself must +take an `id` which represents the current iteration. The `id` can be used in the `accessor` subscript operator to access or assign to the corresponding element of data that the accessor represents. 
+#### 4.2 ) Use the `nd_range` and `nd_item` variant +This version of `parallel_for` takes an `nd_range` which is made up of two +`range`s describing the global range and the local range (work-group size). The +kernel function must take an `nd_item`, which cannot be passed directly to the +subscript operator of an `accessor`. Instead, retrieve the `id` using the +`get_global_id` member function. #### Build And Execution Hints diff --git a/Code_Exercises/Data_Parallelism/solution.cpp b/Code_Exercises/Data_Parallelism/solution.cpp index 2c4a17eb..efcbca62 100644 --- a/Code_Exercises/Data_Parallelism/solution.cpp +++ b/Code_Exercises/Data_Parallelism/solution.cpp @@ -12,16 +12,54 @@ #include "../helpers.hpp" -class vector_add; +class vector_add_1; +class vector_add_2; -int main() { +void test_range() { + constexpr size_t dataSize = 1024; + + int a[dataSize], b[dataSize], r[dataSize]; + for (int i = 0; i < dataSize; ++i) { + a[i] = i; + b[i] = i; + r[i] = 0; + } + + try { + auto defaultQueue = sycl::queue{}; + + auto bufA = sycl::buffer{a, sycl::range{dataSize}}; + auto bufB = sycl::buffer{b, sycl::range{dataSize}}; + auto bufR = sycl::buffer{r, sycl::range{dataSize}}; + + defaultQueue.submit([&](sycl::handler& cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + sycl::accessor accR{bufR, cgh, sycl::write_only}; + + cgh.parallel_for( + sycl::range{dataSize}, [=](sycl::id<1> globalId) { + accR[globalId] = accA[globalId] + accB[globalId]; + }); + }); + + defaultQueue.throw_asynchronous(); + } catch (const sycl::exception& e) { + std::cout << "Exception caught: " << e.what() << std::endl; + } + + SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); +} + +void test_nd_range() { constexpr size_t dataSize = 1024; + constexpr size_t workGroupSize = 128; - float a[dataSize], b[dataSize], r[dataSize]; + int a[dataSize], b[dataSize], r[dataSize]; for (int i = 0; i < dataSize; ++i) { - a[i] = static_cast(i); - b[i] = static_cast(i); - r[i] = 0.0f; + a[i] = i; + b[i] = i; + r[i] = 0; } try { @@ -31,22 +69,29 @@ int main() { auto bufB = sycl::buffer{b, sycl::range{dataSize}}; auto bufR = sycl::buffer{r, sycl::range{dataSize}}; - defaultQueue - .submit([&](sycl::handler& cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_only}; - sycl::accessor accB{bufB, cgh, sycl::read_only}; - sycl::accessor accR{bufR, cgh, sycl::write_only}; + defaultQueue.submit([&](sycl::handler& cgh) { + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + sycl::accessor accR{bufR, cgh, sycl::write_only}; + + auto ndRange = + sycl::nd_range{sycl::range{dataSize}, sycl::range{workGroupSize}}; - cgh.parallel_for( - sycl::range{dataSize}, - [=](sycl::id<1> idx) { accR[idx] = accA[idx] + accB[idx]; }); - }) - .wait(); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> itm) { + sycl::id globalId = itm.get_global_id(); + accR[globalId] = accA[globalId] + accB[globalId]; + }); + }); defaultQueue.throw_asynchronous(); } catch (const sycl::exception& e) { std::cout << "Exception caught: " << e.what() << std::endl; } - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2.0f; }); + SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); +} + +int main() { + test_range(); + test_nd_range(); } diff --git a/Code_Exercises/Data_Parallelism/source.cpp b/Code_Exercises/Data_Parallelism/source.cpp index 323e7527..b5a3efae 100644 --- a/Code_Exercises/Data_Parallelism/source.cpp +++ b/Code_Exercises/Data_Parallelism/source.cpp @@ 
-44,11 +44,11 @@ int main() { constexpr size_t dataSize = 1024; - float a[dataSize], b[dataSize], r[dataSize]; + int a[dataSize], b[dataSize], r[dataSize]; for (int i = 0; i < dataSize; ++i) { - a[i] = static_cast(i); - b[i] = static_cast(i); - r[i] = 0.0f; + a[i] = i; + b[i] = i; + r[i] = 0; } // Task: Compute r[i] = a[i] + b[i] in parallel on the SYCL device @@ -56,5 +56,5 @@ int main() { r[i] = a[i] + b[i]; } - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2.0f; }); + SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); } diff --git a/Code_Exercises/ND_Range_Kernel/CMakeLists.txt b/Code_Exercises/ND_Range_Kernel/CMakeLists.txt deleted file mode 100644 index e9019b2f..00000000 --- a/Code_Exercises/ND_Range_Kernel/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -#[[ - SYCL Academy (c) - - SYCL Academy is licensed under a Creative Commons Attribution-ShareAlike 4.0 - International License. - - You should have received a copy of the license along with this work. If not, - see . -]] - -add_sycl_executable(ND_Range_Kernel source) -if(SYCL_ACADEMY_ENABLE_SOLUTIONS) - add_sycl_executable(ND_Range_Kernel solution) -endif() diff --git a/Code_Exercises/ND_Range_Kernel/README.md b/Code_Exercises/ND_Range_Kernel/README.md deleted file mode 100644 index e11a73be..00000000 --- a/Code_Exercises/ND_Range_Kernel/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# SYCL Academy - -## ND Range Kernel ---- - -In this exercise you will learn how to enqueue ND range kernel functions. - ---- - -### 1.) Use items in parallel_for - -Using the application from any exercise so far or creating a new one, enqueue a -kernel function using the `parallel_for` variant which takes a `range` but has -the kernel function take an `item`. - -Feel free to use either the buffer/accessor model and feel free to use any -method of synchronization and copy back. - -When using an `item` you cannot pass this directly to the subscript operator of -an `accessor` you have to retrieve the `id`, you can do this by calling the -`get_id` member function. - -### 2.) Enqueue an ND range kernel - -Using the application from any exercise so far or creating a new one, enqueue an -ND range kernel function using the `parallel_for` variant which takes an -`nd_range`. - -Remember an `nd_range` is made up of two `range`s, the first being the global -range and the second being the local range or the work-group size. - -Remember that when using this variant of `parallel_for` the kernel function -takes an `nd_item`. - -Similarly to to the `item` when using the `nd_item` you cannot pass this -directly to the subscript operator of an `accessor`, you can retrieve the `id` -by calling the `get_global_id` member function. - -Feel free to use either the buffer/accessor model and feel free to use any -method of synchronization and copy back. - -## Build and execution hints - -For DevCloud via JupiterLab follow these [instructions](../devcloudJupyter.md). - -For DPC++: [instructions](../dpcpp.md). - -For AdaptiveCpp: [instructions](../adaptivecpp.md). diff --git a/Code_Exercises/ND_Range_Kernel/solution.cpp b/Code_Exercises/ND_Range_Kernel/solution.cpp deleted file mode 100644 index 24f1c746..00000000 --- a/Code_Exercises/ND_Range_Kernel/solution.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - SYCL Academy (c) - - SYCL Academy is licensed under a Creative Commons - Attribution-ShareAlike 4.0 International License. - - You should have received a copy of the license along with this - work. If not, see . 
-*/ - -#include - -#include "../helpers.hpp" - -class vector_add_1; -class vector_add_2; - -void test_item() { - constexpr size_t dataSize = 1024; - - int a[dataSize], b[dataSize], r[dataSize]; - for (int i = 0; i < dataSize; ++i) { - a[i] = i; - b[i] = i; - r[i] = 0; - } - - try { - auto gpuQueue = sycl::queue{sycl::gpu_selector_v}; - - auto bufA = sycl::buffer{a, sycl::range{dataSize}}; - auto bufB = sycl::buffer{b, sycl::range{dataSize}}; - auto bufR = sycl::buffer{r, sycl::range{dataSize}}; - - gpuQueue.submit([&](sycl::handler& cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_only}; - sycl::accessor accB{bufB, cgh, sycl::read_only}; - sycl::accessor accR{bufR, cgh, sycl::write_only}; - - cgh.parallel_for( - sycl::range{dataSize}, [=](sycl::item<1> itm) { - auto globalId = itm.get_id(); - accR[globalId] = accA[globalId] + accB[globalId]; - }); - }); - - gpuQueue.throw_asynchronous(); - } catch (const sycl::exception& e) { - std::cout << "Exception caught: " << e.what() << std::endl; - } - - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); -} - -void test_nd_item() { - constexpr size_t dataSize = 1024; - constexpr size_t workGroupSize = 128; - - int a[dataSize], b[dataSize], r[dataSize]; - for (int i = 0; i < dataSize; ++i) { - a[i] = i; - b[i] = i; - r[i] = 0; - } - - try { - auto gpuQueue = sycl::queue{sycl::gpu_selector_v}; - - auto bufA = sycl::buffer{a, sycl::range{dataSize}}; - auto bufB = sycl::buffer{b, sycl::range{dataSize}}; - auto bufR = sycl::buffer{r, sycl::range{dataSize}}; - - gpuQueue.submit([&](sycl::handler& cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - sycl::accessor accB{bufB, cgh, sycl::read_write}; - sycl::accessor accR{bufR, cgh, sycl::read_write}; - - auto ndRange = - sycl::nd_range{sycl::range{dataSize}, sycl::range{workGroupSize}}; - - cgh.parallel_for(ndRange, [=](sycl::nd_item<1> itm) { - auto globalId = itm.get_global_id(); - accR[globalId] = accA[globalId] + accB[globalId]; - }); - }); - - gpuQueue.throw_asynchronous(); - } catch (const sycl::exception& e) { - std::cout << "Exception caught: " << e.what() << std::endl; - } - - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); -} - -int main() { - test_item(); - test_nd_item(); -} diff --git a/Code_Exercises/ND_Range_Kernel/source.cpp b/Code_Exercises/ND_Range_Kernel/source.cpp deleted file mode 100644 index 7ce162ea..00000000 --- a/Code_Exercises/ND_Range_Kernel/source.cpp +++ /dev/null @@ -1,75 +0,0 @@ -/* - SYCL Academy (c) - - SYCL Academy is licensed under a Creative Commons - Attribution-ShareAlike 4.0 International License. - - You should have received a copy of the license along with this - work. If not, see . - - * SYCL Quick Reference - * ~~~~~~~~~~~~~~~~~~~~ - * - * // Default construct a queue - * auto q = sycl::queue{}; - * - * // Construct an in-order queue - * auto q = sycl::queue{sycl::default_selector_v, - * {sycl::property::queue::in_order{}}}; - * - * // Declare a buffer pointing to ptr - * auto buf = sycl::buffer{ptr, sycl::range{n}}; - * - * // Do a USM memcpy - * auto event = q.memcpy(dst_ptr, src_ptr, sizeof(T)*n); - * // Do a USM memcpy with dependent events - * auto event = q.memcpy(dst_ptr, src_ptr, sizeof(T)*n, {event1, event2}); - * - * // Wait on an event - * event.wait(); - * - * // Wait on a queue - * q.wait(); - * - * // Submit work to the queue - * auto event = q.submit([&](sycl::handler &cgh) { - * // COMMAND GROUP - * }); - * - * - * // Within the command group you can - * // 1. 
Declare an accessor to a buffer - * auto read_write_acc = sycl::accessor{buf, cgh}; - * auto read_acc = sycl::accessor{buf, cgh, sycl::read_only}; - * auto write_acc = sycl::accessor{buf, cgh, sycl::write_only}; - * auto no_init_acc = sycl::accessor{buf, cgh, sycl::no_init}; - * // 2. Enqueue a parallel for: - * // i: With range: - * cgh.parallel_for(sycl::range{n}, - * [=](sycl::id<1> i) { // Do something }); - * // ii: With nd_range: - * cgh.parallel_for(sycl::nd_range{ - * globalRange, localRange}, [=](sycl::nd_item<1> i) { - * // Do something - * }); -*/ - -#include "../helpers.hpp" - -int main() { - constexpr size_t dataSize = 1024; - - int a[dataSize], b[dataSize], r[dataSize]; - for (int i = 0; i < dataSize; ++i) { - a[i] = i; - b[i] = i; - r[i] = 0; - } - - // Task: parallelise the vector add kernel using nd_range - for (int i = 0; i < dataSize; ++i) { - r[i] = a[i] + b[i]; - } - - SYCLACADEMY_ASSERT_EQUAL(r, [](size_t i) { return i * 2; }); -} diff --git a/Lesson_Materials/Data_Parallelism/index.html b/Lesson_Materials/Data_Parallelism/index.html index c1018916..2f2e571e 100644 --- a/Lesson_Materials/Data_Parallelism/index.html +++ b/Lesson_Materials/Data_Parallelism/index.html @@ -153,6 +153,40 @@ + +
+
+ #### SYCL execution model +
+
+
+ * SYCL kernel functions are invoked within an **nd-range**
+ * An nd-range contains a number of work-groups, each made up of a number of work-items
+ * Work-groups always have the same number of work-items
+
+ ![ND-Range](../common-revealjs/images/ndrange.png "ND-Range") +
+
+
+ +
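As a rough sketch of the two components described above (the sizes here are illustrative, not taken from the lesson), an `nd_range` pairs a global range with a local range, and the number of work-groups follows from dividing one by the other:

```cpp
#include <sycl/sycl.hpp>

int main() {
  // Global range of 1024 work-items split into work-groups of 64 work-items:
  // this nd-range therefore contains 1024 / 64 = 16 work-groups.
  auto ndr = sycl::nd_range<1>{sycl::range<1>{1024}, sycl::range<1>{64}};

  // The global range should be divisible by the local range in each dimension.
  return ndr.get_group_range()[0] == 16 ? 0 : 1;
}
```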
+
+ #### SYCL execution model +
+
+
+ * The nd-range describes an **iteration space**: how it is composed in terms of work-groups and work-items
+ * An nd-range can have 1, 2 or 3 dimensions
+ * An nd-range has two components
+ * The **global-range** describes the total number of work-items in each dimension
+ * The **local-range** describes the number of work-items in a work-group in each dimension
+
+ ![ND-Range](../common-revealjs/images/ndrange-example.png "ND-Range") +
+
+
@@ -160,11 +194,100 @@
- * Work-items are launched in parallel in a `sycl::range`.
- * In order to maximize parallelism, the range should correspond to the problem size.
+ * Each invocation in the iteration space of an nd-range is a work-item
+ * Each work-item knows its position in the nd-range and can query information about it
+ * For example, a work-item in the nd-range pictured here has the following:
+ * **Global range**: {12, 12}
+ * **Global id**: {5, 6}
+ * **Group range**: {3, 3}
+ * **Group id**: {1, 1}
+ * **Local range**: {4, 4}
+ * **Local id**: {1, 2}
+
+ ![ND-Range](../common-revealjs/images/ndrange-example-work-item.png "ND-Range") +
+
+
+ +
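A minimal sketch of how a kernel can query those values through its `nd_item` (the kernel name and queue setup are illustrative assumptions; the values in the comments mirror the example work-item above):

```cpp
#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  q.submit([&](sycl::handler& cgh) {
     cgh.parallel_for<class position_queries>(
         sycl::nd_range<2>{{12, 12}, {4, 4}}, [=](sycl::nd_item<2> it) {
           // Values shown for the example work-item; unused here, the point
           // is the set of queries available on an nd_item.
           sycl::range<2> globalRange = it.get_global_range();   // {12, 12}
           sycl::id<2> globalId = it.get_global_id();            // e.g. {5, 6}
           sycl::range<2> groupRange = it.get_group_range();     // {3, 3}
           sycl::id<2> groupId = it.get_group().get_group_id();  // e.g. {1, 1}
           sycl::range<2> localRange = it.get_local_range();     // {4, 4}
           sycl::id<2> localId = it.get_local_id();              // e.g. {1, 2}
         });
   }).wait();
}
```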
+
+ #### SYCL execution model +
+
+
+ Typically, in an nd-range invocation, SYCL will execute the SYCL kernel function on a very large number of work-items, often in the thousands
+
+ ![ND-Range](../common-revealjs/images/ndrange-invocation.png "ND-Range") +
+
+
+ +
+
+ #### SYCL execution model +
+
+
+ * Multiple work-items will generally execute concurrently
+ * On vector hardware this is often done in lock-step, which means they execute the same hardware instructions
+ * The number of work-items that will execute concurrently can vary from one device to another
+ * Work-items will be batched along with other work-items in the same work-group
+ * The order in which work-items and work-groups are executed is implementation-defined
+
+ ![ND-Range](../common-revealjs/images/ndrange-lock-step.png "ND-Range") +
+
+
+ +
+
+ #### SYCL execution model +
+
+
+ * Work-items in a work-group can be synchronized using a work-group barrier + * All work-items within a work-group must reach the barrier before any can continue on +
+
+ ![ND-Range](../common-revealjs/images/work-group-0.png "ND-Range") +
+
+
+ +
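A sketch of the work-group barrier described above, using work-group local memory so that the synchronisation actually matters (the kernel name and sizes are made up for illustration; `sycl::local_accessor` and `sycl::group_barrier` are the SYCL 2020 spellings):

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  constexpr size_t dataSize = 1024;
  constexpr size_t workGroupSize = 128;
  std::vector<int> out(dataSize, 0);
  {
    auto buf = sycl::buffer{out.data(), sycl::range{dataSize}};
    sycl::queue q;

    q.submit([&](sycl::handler& cgh) {
      sycl::accessor acc{buf, cgh, sycl::write_only};
      // One element of work-group local memory per work-item in the group.
      sycl::local_accessor<int> scratch{sycl::range{workGroupSize}, cgh};

      cgh.parallel_for<class barrier_example>(
          sycl::nd_range{sycl::range{dataSize}, sycl::range{workGroupSize}},
          [=](sycl::nd_item<1> it) {
            auto localId = it.get_local_id(0);
            scratch[localId] = static_cast<int>(it.get_global_id(0));

            // Every work-item in the group must reach this barrier before
            // any may continue, so the read below sees the writes above.
            sycl::group_barrier(it.get_group());

            // Safe: read a value written by a *different* work-item.
            acc[it.get_global_id()] = scratch[workGroupSize - 1 - localId];
          });
    });
  }  // the buffer destructor waits for the kernel and copies back to 'out'
}
```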
+
+ #### SYCL execution model +
+
+
+ * SYCL does not support synchronizing across all work-items in the nd-range + * The only way to do this is to split the computation into separate SYCL kernel functions
- ![Work-Group](../common-revealjs/images/SYCL_range.png "Work-Group") + ![ND-Range](../common-revealjs/images/work-group-0-1.png "ND-Range") +
+
+
+ +
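A sketch of that work-around (buffer names and sizes are invented for illustration): the second kernel can safely read elements written by any work-item of the first, because the runtime orders the two submissions through their accessor dependencies.

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  constexpr size_t dataSize = 1024;
  std::vector<int> in(dataSize, 0), out(dataSize, 0);
  {
    auto bufIn = sycl::buffer{in.data(), sycl::range{dataSize}};
    auto bufOut = sycl::buffer{out.data(), sycl::range{dataSize}};
    sycl::queue q;

    // Phase 1: every work-item writes its own element.
    q.submit([&](sycl::handler& cgh) {
      sycl::accessor acc{bufIn, cgh, sycl::write_only};
      cgh.parallel_for<class phase_one>(
          sycl::range{dataSize},
          [=](sycl::id<1> i) { acc[i] = static_cast<int>(i[0]); });
    });

    // Phase 2: reads an element written by a *different* work-item. The
    // accessor dependency guarantees phase 1 has finished for the whole
    // nd-range before phase 2 starts, which no in-kernel barrier could do.
    q.submit([&](sycl::handler& cgh) {
      sycl::accessor accIn{bufIn, cgh, sycl::read_only};
      sycl::accessor accOut{bufOut, cgh, sycl::write_only};
      cgh.parallel_for<class phase_two>(
          sycl::range{dataSize}, [=](sycl::id<1> i) {
            accOut[i] = accIn[(i[0] + 1) % dataSize];
          });
    });
  }  // buffer destructors wait for both kernels and copy back
}
```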
+
+ #### SYCL execution model +
+
+
+ * SYCL also provides a simplified execution model with `sycl::range` in place of `sycl::nd_range`
+ * The caller only provides the global range
+ * The local range is decided by the runtime and cannot be inspected
+ * No synchronization is possible between work-items
+ * Useful for simple problems which don't require synchronization, local memory or ultimate performance
+ * The runtime may not always have enough information to choose the best-performing work-group size
+
+ ND-Range
+ SYCL-Range
@@ -177,8 +300,8 @@
-cgh.parallel_for<my_kernel>(range{64, 64}, 
-                          [=](id<2> idx){
+cgh.parallel_for<my_kernel>(nd_range{{1024, 16}, {32, 4}},
+                          [=](nd_item<2> item){
   // SYCL kernel function is executed 
   // on a range of work-items
 });
@@ -187,74 +310,207 @@
 					
* In SYCL, kernel functions can be enqueued to execute - over a range of work-items using `parallel_for`. - * When using `parallel_for` you must also pass `range` - which describes the iteration space over which the kernel - is to be executed. + over a range of work-items using `parallel_for` + * The first argument to `parallel_for` is an `nd_range` or + a `range` which describes the iteration space over which + the kernel is to be executed + * The kernel function has to take an `nd_item` or `item`, + respectively, as the parameter (or any type they can be + implicitly converted to, commonly from `item` to `id`)
- +
- #### Parallel_for + #### Expressing parallelism
-
-
-cgh.parallel_for<my_kernel>(range{64, 64}, 
-                          [=](id<2> idx){
-  // SYCL kernel function is executed 
-  // on a range of work-items
+						
+
+
+cgh.parallel_for<kernel>(nd_range<1>{1024, 32},
+  [=](nd_item<1> ndItem){
+    /* kernel function code */
+    id globalId = ndItem.get_global_id();
+    id localId = ndItem.get_local_id();
 });
-							
+
+
+
+ * Overload taking an `nd_range` object specifies the global and local range + * An `nd_item` parameter represents the global and local range and index +
+
+
+cgh.parallel_for<kernel>(range<1>{1024},
+  [=](item<1> item){
+    /* kernel function code */
+    id globalId = item.get_id();
+});
+								
+
+
+ * Overload taking a `range` object specifies the global range, runtime decides local range + * An `item` parameter represents the global range and the index within the global range +
+
+
+cgh.parallel_for<kernel>(range<1>{1024},
+  [=](id<1> globalId){
+    /* kernel function code */
+});
+								
+
+
+ * Overload taking a `range` object specifies the global range, runtime decides local range + * An `id` parameter represents the index within the global range +
-
- * When using `parallel_for` you must also have the - function object which represents the kernel function take - an `id`. - * This represents the current work-item being executed and - its position within the iteration space. + + +
+
+ #### SYCL memory model +
+
+
+ * Each work-item can access a dedicated region of **private memory** + * A work-item cannot access the private memory of another work-item +
+
+ ![Private Memory](../common-revealjs/images/workitem-privatememory.png "Private Memory") +
+ +
+
+ +
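For example (a sketch with an illustrative buffer, queue and kernel name), any ordinary variable declared inside a kernel lives in the executing work-item's private memory:

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  constexpr size_t dataSize = 1024;
  std::vector<int> out(dataSize, 0);
  {
    auto buf = sycl::buffer{out.data(), sycl::range{dataSize}};
    sycl::queue q;
    q.submit([&](sycl::handler& cgh) {
      sycl::accessor acc{buf, cgh, sycl::write_only};
      cgh.parallel_for<class private_memory_example>(
          sycl::range{dataSize}, [=](sycl::id<1> i) {
            // 'temp' is an ordinary variable declared inside the kernel: it
            // lives in this work-item's private memory and no other
            // work-item can see or modify it.
            int temp = static_cast<int>(i[0]) * 2;
            acc[i] = temp;
          });
    });
  }
}
```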
+
+ #### SYCL memory model +
+
+
+ ![Local Memory](../common-revealjs/images/workitem-localmemory.png "Local Memory") +
+
+ * Each work-item can access a dedicated region of **local memory** accessible to all work-items in a work-group + * A work-item cannot access the local memory of another work-group +
- #### Expressing parallelism + #### SYCL memory model
-
-
-							
-cgh.parallel_for<kernel>(range<1>(1024), 
-  [=](id<1> idx){
-    /* kernel function code */
-});
-							
+
+ ![Global Memory](../common-revealjs/images/workitem-constantmemory.png "Global Memory") +
+
+ * Each work-item can access a single region of **global memory** that's accessible to all work-items in an nd-range
+ +
+
+ +
+
+ #### SYCL memory model +
+
+
+ * Each memory region has a different size and access latency + * Global memory is larger than local memory and local memory is larger than private memory + * Private memory is faster than local memory and local memory is faster than global memory +
+
+ ![Memory Regions](../common-revealjs/images/memory-regions.png "Memory Regions") +
+
+
+ +
+
+ #### Accessing Data With Accessors +
+
+ * There are a few different ways to access the data represented by an accessor
+ * The subscript operator can take an **id**
+ * It must have the same dimensionality as the accessor
+ * For dimensions > 1, the linear address is calculated in row-major order
+ * Nested subscript operators can be called for each dimension, each taking a **size_t**
+ * E.g. a 3-dimensional accessor: acc[x][y][z] = …
+ * A pointer to memory can be retrieved by calling **get_pointer**
+ * This returns a raw pointer to the data
+
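The nested-subscript form listed above does not appear in the following examples, so here is a brief sketch (the 3-D buffer, its sizes and the `gpuQueue` are illustrative, echoing the neighbouring slides):

```cpp
auto buf3d = sycl::buffer<float, 3>{sycl::range<3>{4, 8, 16}};

gpuQueue.submit([&](sycl::handler& cgh) {
  sycl::accessor acc{buf3d, cgh, sycl::write_only};
  cgh.parallel_for<class nested_subscript>(
      sycl::range<3>{4, 8, 16}, [=](sycl::id<3> i) {
        // One size_t subscript per dimension; equivalent to acc[i],
        // with the linear address computed in row-major order.
        acc[i[0]][i[1]][i[2]] = 0.0f;
      });
});
```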
+ +
+
+ #### Accessing Data With Accessors +
+
+
-							
-cgh.parallel_for<kernel>(range<1>(1024), 
-  [=](item<1> item){
-    /* kernel function code */
+buffer<float, 1> bufA(dA.data(), range<1>(dA.size()));
+buffer<float, 1> bufB(dB.data(), range<1>(dB.size()));
+buffer<float, 1> bufO(dO.data(), range<1>(dO.size()));
+
+gpuQueue.submit([&](handler &cgh){
+  sycl::accessor inA{bufA, cgh, sycl::read_only};
+  sycl::accessor inB{bufB, cgh, sycl::read_only};
+  sycl::accessor out{bufO, cgh, sycl::write_only};
+  cgh.parallel_for<add>(range<1>(dA.size()),
+    [=](id<1> i){
+    out[i] = inA[i] + inB[i];
+  });
 });
 							
+
+
+ * Here we access the data of the `accessor` by + passing in the `id` passed to the SYCL kernel + function. +
+
+
+ +
+
+ #### Accessing Data With Accessors +
+
+
-							
-cgh.parallel_for<kernel>(nd_range<1>(range<1>(1024), 
-  range<1>(32)),[=](nd_item<1> ndItem){
-    /* kernel function code */
+buffer<float, 1> bufA(dA.data(), range<1>(dA.size()));
+buffer<float, 1> bufB(dB.data(), range<1>(dB.size()));
+buffer<float, 1> bufO(dO.data(), range<1>(dO.size()));
+
+gpuQueue.submit([&](handler &cgh){
+  sycl::accessor inA{bufA, cgh, sycl::read_only};
+  sycl::accessor inB{bufB, cgh, sycl::read_only};
+  sycl::accessor out{bufO, cgh, sycl::write_only};
+  cgh.parallel_for<add>(rng, [=](item<3> i){
+    auto ptrA = inA.get_pointer();
+    auto ptrB = inB.get_pointer();
+    auto ptrO = out.get_pointer();
+    auto linearId = i.get_linear_id();
+
+    ptrO[linearId] = ptrA[linearId] + ptrB[linearId];
+  });
 });
 							
-
- * Overload taking a **range** object specifies the global range, runtime decides local range - * An **id** parameter represents the index within the global range - ____________________________________________________________________________________________ - * Overload taking a **range** object specifies the global range, runtime decides local range - * An **item** parameter represents the global range and the index within the global range - ____________________________________________________________________________________________ - * Overload taking an **nd_range** object specifies the global and local range - * An **nd_item** parameter represents the global and local range and index +
+ * Here we retrieve the underlying pointer for each + of the `accessor`s. + * We then access the pointer using the linearized + `id` by calling the `get_linear_id` member function + on the `item`. + * Again this linearization is calculated in + row-major order.
@@ -271,8 +527,11 @@ Code_Exercises/Data_Parallelism/source.cpp
- Implement a SYCL application that adds two arrays of - values together in parallel using `parallel_for`. + Implement a SYCL application using `parallel_for` to add two arrays of values +
+
+ * Use buffers and accessors to manage data + * Try the `sycl::range` and `sycl::nd_range` variants
diff --git a/Lesson_Materials/ND_Range_Kernel/index.html b/Lesson_Materials/ND_Range_Kernel/index.html deleted file mode 100644 index 2c50d585..00000000 --- a/Lesson_Materials/ND_Range_Kernel/index.html +++ /dev/null @@ -1,431 +0,0 @@ - - - - - - - - - - - - - -
-
-
- - - - -
- -
- ## ND Range Kernels -
- -
- ## Learning Objectives - * Learn about the SYCL execution and memory model - * Learn how to enqueue an nd-range kernel function -
- -
-
- #### SYCL execution model -
-
-
- * SYCL kernel functions are executed by **work-items** - * You can think of a work-item as a thread of execution - * Each work-item will execute a SYCL kernel function from start to end - * A work-item can run on CPU threads, SIMD lanes, GPU threads, or any other kind of processing element -
-
- ![Work-Item](../common-revealjs/images/workitem.png "Work-Item") -
- -
-
- -
-
- #### SYCL execution model -
-
-
- * Work-items are collected together into **work-groups** - * The size of work-groups is generally relative to what is optimal on the device being targeted - * It can also be affected by the resources used by each work-item -
-
- ![Work-Group](../common-revealjs/images/workgroup.png "Work-Group") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * SYCL kernel functions are invoked within an **nd-range** - * An nd-range has a number of work-groups and subsequently a number of work-items - * Work-groups always have the same number of work-items -
-
- ![ND-Range](../common-revealjs/images/ndrange.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * The nd-range describes an **iteration space**: how it is composed in terms of work-groups and work-items - * An nd-range can be 1, 2 or 3 dimensions - * An nd-range has two components - * The **global-range** describes the total number of work-items in each dimension - * The **local-range** describes the number of work-items in a work-group in each dimension -
-
- ![ND-Range](../common-revealjs/images/ndrange-example.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * Each invocation in the iteration space of an nd-range is a work-item - * Each invocation knows which work-item it is on and can query certain information about its position in the nd-range - * Each work-item has the following: - * **Global range**: {12, 12} - * **Global id**: {5, 6} - * **Group range**: {3, 3} - * **Group id**: {1, 1} - * **Local range**: {4, 4} - * **Local id**: {1, 2} -
-
- ![ND-Range](../common-revealjs/images/ndrange-example-work-item.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- Typically an nd-range invocation SYCL will execute the SYCL kernel function on a very large number of work-items, often in the thousands -
-
- ![ND-Range](../common-revealjs/images/ndrange-invocation.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * Multiple work-items will generally execute concurrently - * On vector hardware this is often done in lock-step, which means the same hardware instructions - * The number of work-items that will execute concurrently can vary from one device to another - * Work-items will be batched along with other work-items in the same work-group - * The order work-items and work-groups are executed in is implementation defined -
-
- ![ND-Range](../common-revealjs/images/ndrange-lock-step.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * Work-items in a work-group can be synchronized using a work-group barrier - * All work-items within a work-group must reach the barrier before any can continue on -
-
- ![ND-Range](../common-revealjs/images/work-group-0.png "ND-Range") -
-
-
- -
-
- #### SYCL execution model -
-
-
- * SYCL does not support synchronizing across all work-items in the nd-range - * The only way to do this is to split the computation into separate SYCL kernel functions -
-
- ![ND-Range](../common-revealjs/images/work-group-0-1.png "ND-Range") -
-
-
- -
-
- #### SYCL memory model -
-
-
- * Each work-item can access a dedicated region of **private memory** - * A work-item cannot access the private memory of another work-item -
-
- ![Private Memory](../common-revealjs/images/workitem-privatememory.png "Private Memory") -
- -
-
- -
-
- #### SYCL memory model -
-
-
- ![Local Memory](../common-revealjs/images/workitem-localmemory.png "Local Memory") -
-
- * Each work-item can access a dedicated region of **local memory** accessible to all work-items in a work-group - * A work-item cannot access the local memory of another work-group -
-
-
- -
-
- #### SYCL memory model -
-
-
- ![Constant Memory](../common-revealjs/images/workitem-constantmemory.png "Constant Memory") -
-
- * Each work-item can access a single region of **global memory** that's accessible to all work-items in a ND-range - * Each work-item can also access a region of global memory reserved as **constant memory**, which is read-only -
- -
-
- -
-
- #### SYCL memory model -
-
-
- * Each memory region has a different size and access latency - * Global / constant memory is larger than local memory and local memory is larger than private memory - * Private memory is faster than local memory and local memory is faster than global / constant memory -
-
- ![Memory Regions](../common-revealjs/images/memory-regions.png "Memory Regions") -
-
-
- -
-
- #### Expressing parallelism -
-
-
-
-							
-cgh.parallel_for<kernel>(range<1>(1024), 
-  [=](id<1> idx){
-    /* kernel function code */
-});
-							
-
-							
-cgh.parallel_for<kernel>(range<1>(1024), 
-  [=](item<1> item){
-    /* kernel function code */
-});
-							
-
-							
-cgh.parallel_for<kernel>(nd_range<1>(range<1>(1024), 
-  range<1>(32)),[=](nd_item<1> ndItem){
-    /* kernel function code */
-});
-							
-
-
- * Overload taking a **range** object specifies the global range, runtime decides local range - * An **id** parameter represents the index within the global range - ____________________________________________________________________________________________ - * Overload taking a **range** object specifies the global range, runtime decides local range - * An **item** parameter represents the global range and the index within the global range - ____________________________________________________________________________________________ - * Overload taking an **nd_range** object specifies the global and local range - * An **nd_item** parameter represents the global and local range and index -
-
-
- -
-
- #### Accessing Data With Accessors -
-
- * There are a few different ways to access the data represented by an accessor - * The subscript operator can take an **id** - * Must be the same dimensionality of the accessor - * For dimensions > 1, linear address is calculated in row major - * Nested subscript operators can be called for each dimension taking a **size_t** - * E.g. a 3-dimensional accessor: acc[x][y][z] = … - * A pointer to memory can be retrieved by calling **get_pointer** - * This returns a raw pointer to the data -
-
- -
-
- #### Accessing Data With Accessors -
-
-
-
-buffer<float, 1> bufA(dA.data(), range<1>(dA.size())); 
-buffer<float, 1> bufB(dB.data(), range<1>(dB.size())); 
-buffer<float, 1> bufO(dO.data(), range<1>(dO.size()));
-
-gpuQueue.submit([&](handler &cgh){
-  sycl::accessor inA{bufA, cgh, sycl::read_only};
-  sycl::accessor inB{bufB, cgh, sycl::read_only};
-  sycl::accessor out{bufO, cgh, sycl::write_only};
-  cgh.parallel_for<add>(range<1>(dA.size()), 
-    [=](id<1> i){ 
-    out[i] = inA[i] + inB[i];
-  });
-});
-							
-
-
- * Here we access the data of the `accessor` by - passing in the `id` passed to the SYCL kernel - function. -
-
-
- -
-
- #### Accessing Data With Accessors -
-
-
-
-buffer<float, 1> bufA(dA.data(), range<1>(dA.size())); 
-buffer<float, 1> bufB(dB.data(), range<1>(dB.size())); 
-buffer<float, 1> bufO(dO.data(), range<1>(dO.size()));
-
-gpuQueue.submit([&](handler &cgh){
-  sycl::accessor inA{bufA, cgh, sycl::read_only};
-  sycl::accessor inB{bufB, cgh, sycl::read_only};
-  sycl::accessor out{bufO, cgh, sycl::write_only};
-  cgh.parallel_for<add>(rng, [=](item<3> i){
-    auto ptrA = inA.get_pointer();
-    auto ptrB = inB.get_pointer();
-    auto ptrO = out.get_pointer();
-    auto linearId = i.get_linear_id();
-
-    ptrA[linearId] = ptrB[linearId] + ptrO[linearId]; 
-  });
-});
-							
-
-
- * Here we retrieve the underlying pointer for each - of the `accessor`s. - * We then access the pointer using the linearized - `id` by calling the `get_linear_id` member function - on the `item`. - * Again this linearization is calculated in - row-major order. -
-
-
- -
- ## Questions -
- -
-
- #### Exercise -
-
- Code_Exercises/ND_Range_Kernel/source -
-
- Implement a SYCL application that will perform a vector add using `parallel_for`, adding multiple elements in parallel. -
-
-
-
- - - - - - - diff --git a/README.md b/README.md index aae8a35b..74c0776b 100644 --- a/README.md +++ b/README.md @@ -85,15 +85,14 @@ may not match completely. | 11 | In Order Queue | [slides][lesson-11-slides] | [exercise][lesson-11-exercise] | [source][lesson-11-source] | [solution][lesson-11-solution] | Yes | Yes | | 12 | Advanced Data Flow | [slides][lesson-12-slides] | [exercise][lesson-12-exercise] | [source][lesson-12-source] | [solution][lesson-12-solution] | Yes | Yes | | 13 | Multiple Devices | [slides][lesson-13-slides] | [exercise][lesson-13-exercise] | [source][lesson-13-source] | [solution][lesson-13-solution] | Yes | Yes | -| 14 | ND Range Kernels | [slides][lesson-14-slides] | [exercise][lesson-14-exercise] | [source][lesson-14-source] | [solution][lesson-14-solution] | Yes | Yes | -| 15 | Image Convolution | [slides][lesson-15-slides] | [exercise][lesson-15-exercise] | | [solution][lesson-15-solution] | Yes | Yes | -| 16 | Coalesced Global Memory | [slides][lesson-16-slides] | [exercise][lesson-16-exercise] | [source][lesson-16-source] | [solution][lesson-16-solution] | Yes | Yes | -| 17 | Vectors | [slides][lesson-17-slides] | [exercise][lesson-17-exercise] | [source][lesson-17-source] | [solution][lesson-17-solution] | Yes | Yes | -| 18 | Local Memory Tiling | [slides][lesson-18-slides] | [exercise][lesson-18-exercise] | [source][lesson-18-source] | [solution][lesson-18-solution] | Yes | Yes | -| 19 | Further Optimisations | [slides][lesson-19-slides] | [exercise][lesson-19-exercise] | [source][lesson-19-source] | [solution][lesson-19-solution] | Yes | Yes | -| 20 | Matrix Transpose | [slides][lesson-20-slides] | [exercise][lesson-20-exercise] | [source][lesson-20-source] | [solution][lesson-20-solution] | Yes | Yes | -| 21 | More SYCL Features | [slides][lesson-21-slides] | [exercise][lesson-21-exercise] | [source][lesson-21-source] | [solution][lesson-21-solution] | Yes | Yes | -| 22 | Functors | [slides][lesson-22-slides] | [exercise][lesson-22-exercise] | [source][lesson-22-source] | [solution][lesson-22-solution] | Yes | Yes | +| 14 | Image Convolution | [slides][lesson-14-slides] | [exercise][lesson-14-exercise] | [source][lesson-14-source] | [solution][lesson-14-solution] | Yes | Yes | +| 15 | Coalesced Global Memory | [slides][lesson-15-slides] | [exercise][lesson-15-exercise] | | [solution][lesson-15-solution] | Yes | Yes | +| 16 | Vectors | [slides][lesson-16-slides] | [exercise][lesson-16-exercise] | [source][lesson-16-source] | [solution][lesson-16-solution] | Yes | Yes | +| 17 | Local Memory Tiling | [slides][lesson-17-slides] | [exercise][lesson-17-exercise] | [source][lesson-17-source] | [solution][lesson-17-solution] | Yes | Yes | +| 18 | Further Optimisations | [slides][lesson-18-slides] | [exercise][lesson-18-exercise] | [source][lesson-18-source] | [solution][lesson-18-solution] | Yes | Yes | +| 19 | Matrix Transpose | [slides][lesson-19-slides] | [exercise][lesson-19-exercise] | [source][lesson-19-source] | [solution][lesson-19-solution] | Yes | Yes | +| 20 | More SYCL Features | [slides][lesson-20-slides] | [exercise][lesson-20-exercise] | [source][lesson-20-source] | [solution][lesson-20-solution] | Yes | Yes | +| 21 | Functors | [slides][lesson-21-slides] | [exercise][lesson-21-exercise] | [source][lesson-21-source] | [solution][lesson-21-solution] | Yes | Yes | ## Building the Exercises @@ -418,47 +417,42 @@ cmake ../ "-GUnix Makefiles" -DSYCL_ACADEMY_USE_DPCPP=ON -DSYCL_ACADEMY_ENABLE_S [lesson-13-source]: 
./Code_Exercises/Multiple_Devices/source.cpp [lesson-13-solution]: ./Code_Exercises/Multiple_Devices/solution.cpp -[lesson-14-slides]: ./Lesson_Materials/ND_Range_Kernel/ -[lesson-14-exercise]: ./Code_Exercises/ND_Range_Kernel/README.md -[lesson-14-source]: ./Code_Exercises/ND_Range_Kernel/source.cpp -[lesson-14-solution]: ./Code_Exercises/ND_Range_Kernel/solution.cpp - -[lesson-15-slides]: ./Lesson_Materials/Image_Convolution/ -[lesson-15-exercise]: ./Code_Exercises/Image_Convolution/README.md -[lesson-15-source]: ./Code_Exercises/Image_Convolution/source.cpp -[lesson-15-solution]: ./Code_Exercises/Image_Convolution/reference.cpp - -[lesson-16-slides]: ./Lesson_Materials/Coalesced_Global_Memory/ -[lesson-16-exercise]: ./Code_Exercises/Coalesced_Global_Memory/README.md -[lesson-16-source]: ./Code_Exercises/Coalesced_Global_Memory/source.cpp -[lesson-16-solution]: ./Code_Exercises/Coalesced_Global_Memory/solution.cpp - -[lesson-17-slides]: ./Lesson_Materials/Vectors/ -[lesson-17-exercise]: ./Code_Exercises/Vectors/README.md -[lesson-17-source]: ./Code_Exercises/Vectors/source.cpp -[lesson-17-solution]: ./Code_Exercises/Vectors/solution.cpp - -[lesson-18-slides]: ./Lesson_Materials/Local_Memory_Tiling/ -[lesson-18-exercise]: ./Code_Exercises/Local_Memory_Tiling/README.md -[lesson-18-source]: ./Code_Exercises/Local_Memory_Tiling/source.cpp -[lesson-18-solution]: ./Code_Exercises/Local_Memory_Tiling/solution.cpp - -[lesson-19-slides]: ./Lesson_Materials/Work_Group_Sizes/ -[lesson-19-exercise]: ./Code_Exercises/Work_Group_Sizes/README.md -[lesson-19-source]: ./Code_Exercises/Work_Group_Sizes/source.cpp -[lesson-19-solution]: ./Code_Exercises/Work_Group_Sizes/solution.cpp - -[lesson-20-slides]: ./Lesson_Materials/Matrix_Transpose/ -[lesson-20-exercise]: ./Code_Exercises/Matrix_Transpose/README.md -[lesson-20-source]: ./Code_Exercises/Matrix_Transpose/source.cpp -[lesson-20-solution]: ./Code_Exercises/Matrix_Transpose/solution.cpp - -[lesson-21-slides]: ./Lesson_Materials/More_SYCL_Features/ -[lesson-21-exercise]: ./Code_Exercises/More_SYCL_Features/README.md -[lesson-21-source]: ./Code_Exercises/More_SYCL_Features/source.cpp -[lesson-21-solution]: ./Code_Exercises/More_SYCL_Features/solution.cpp - -[lesson-22-slides]: ./Lesson_Materials/Fast_Track/ -[lesson-22-exercise]: ./Code_Exercises/Functors/README.md -[lesson-22-source]: ./Code_Exercises/Functors/source.cpp -[lesson-22-solution]: ./Code_Exercises/Functors/solution.cpp +[lesson-14-slides]: ./Lesson_Materials/Image_Convolution/ +[lesson-14-exercise]: ./Code_Exercises/Image_Convolution/README.md +[lesson-14-source]: ./Code_Exercises/Image_Convolution/source.cpp +[lesson-14-solution]: ./Code_Exercises/Image_Convolution/reference.cpp + +[lesson-15-slides]: ./Lesson_Materials/Coalesced_Global_Memory/ +[lesson-15-exercise]: ./Code_Exercises/Coalesced_Global_Memory/README.md +[lesson-15-source]: ./Code_Exercises/Coalesced_Global_Memory/source.cpp +[lesson-15-solution]: ./Code_Exercises/Coalesced_Global_Memory/solution.cpp + +[lesson-16-slides]: ./Lesson_Materials/Vectors/ +[lesson-16-exercise]: ./Code_Exercises/Vectors/README.md +[lesson-16-source]: ./Code_Exercises/Vectors/source.cpp +[lesson-16-solution]: ./Code_Exercises/Vectors/solution.cpp + +[lesson-17-slides]: ./Lesson_Materials/Local_Memory_Tiling/ +[lesson-17-exercise]: ./Code_Exercises/Local_Memory_Tiling/README.md +[lesson-17-source]: ./Code_Exercises/Local_Memory_Tiling/source.cpp +[lesson-17-solution]: ./Code_Exercises/Local_Memory_Tiling/solution.cpp + +[lesson-18-slides]: 
./Lesson_Materials/Work_Group_Sizes/ +[lesson-18-exercise]: ./Code_Exercises/Work_Group_Sizes/README.md +[lesson-18-source]: ./Code_Exercises/Work_Group_Sizes/source.cpp +[lesson-18-solution]: ./Code_Exercises/Work_Group_Sizes/solution.cpp + +[lesson-19-slides]: ./Lesson_Materials/Matrix_Transpose/ +[lesson-19-exercise]: ./Code_Exercises/Matrix_Transpose/README.md +[lesson-19-source]: ./Code_Exercises/Matrix_Transpose/source.cpp +[lesson-19-solution]: ./Code_Exercises/Matrix_Transpose/solution.cpp + +[lesson-20-slides]: ./Lesson_Materials/More_SYCL_Features/ +[lesson-20-exercise]: ./Code_Exercises/More_SYCL_Features/README.md +[lesson-20-source]: ./Code_Exercises/More_SYCL_Features/source.cpp +[lesson-20-solution]: ./Code_Exercises/More_SYCL_Features/solution.cpp + +[lesson-21-slides]: ./Lesson_Materials/Fast_Track/ +[lesson-21-exercise]: ./Code_Exercises/Functors/README.md +[lesson-21-source]: ./Code_Exercises/Functors/source.cpp +[lesson-21-solution]: ./Code_Exercises/Functors/solution.cpp