From b3ed5a60e4d402ce731a9969c5824bf190ecc3f1 Mon Sep 17 00:00:00 2001
From: Gabriele Merlo <gmerlo@ices.utexas.edu>
Date: Thu, 8 Feb 2024 22:04:32 -0800
Subject: [PATCH 1/7] add support for 7D arrays

---
 include/gtensor/assign.h     | 68 +++++++++++++++++++++++++++++++++++-
 include/gtensor/expression.h |  6 ++++
 include/gtensor/gtensor.h    | 62 ++++++++++++++++++++++++++++++++
 include/gtensor/operator.h   | 32 +++++++++++++++++
 4 files changed, 167 insertions(+), 1 deletion(-)
diff --git a/include/gtensor/assign.h b/include/gtensor/assign.h
index d8f56398..c834ff43 100644
--- a/include/gtensor/assign.h
+++ b/include/gtensor/assign.h
@@ -134,6 +134,32 @@ struct assigner<6, space::host>
   }
 };
 
+
+template <>
+struct assigner<7, space::host>
+{
+  template <typename E1, typename E2>
+  static void run(E1& lhs, const E2& rhs, stream_view stream)
+  {
+    // printf("assigner<7, host>\n");
+    for (int o = 0; o < lhs.shape(6); o++) {
+      for (int n = 0; n < lhs.shape(5); n++) {
+	for (int m = 0; m < lhs.shape(4); m++) {
+	  for (int l = 0; l < lhs.shape(3); l++) {
+	    for (int k = 0; k < lhs.shape(2); k++) {
+	      for (int j = 0; j < lhs.shape(1); j++) {
+		for (int i = 0; i < lhs.shape(0); i++) {
+		  lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
+		}
+	      }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 #if defined(GTENSOR_DEVICE_CUDA) || defined(GTENSOR_DEVICE_HIP)
 
 #ifdef GTENSOR_PER_DIM_KERNELS
@@ -219,6 +245,25 @@ __global__ void kernel_assign_6(Elhs lhs, Erhs _rhs)
   }
 }
 
+template <typename Elhs, typename Erhs>
+__global__ void kernel_assign_7(Elhs lhs, Erhs _rhs)
+{
+  auto rhs = _rhs;
+  int tidx = threadIdx.x + blockIdx.x * blockDim.x;
+  int tidy = threadIdx.y + blockIdx.y * blockDim.y;
+  int tidz = blockIdx.z;
+  if (tidx < lhs.shape(0) * lhs.shape(1) &&
+      tidy < lhs.shape(2) * lhs.shape(3)) {
+    int j = tidx / lhs.shape(0), i = tidx % lhs.shape(0);
+    int l = tidy / lhs.shape(2), k = tidy % lhs.shape(2);
+    int m = tidz % lhs.shape(5) % lhs.shape(4);
+    int n = tidz % lhs.shape(5) / lhs.shape(4),
+    int o = tidz / lhs.shape(5);
+
+    lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, o);
+  }
+}
+
 template <>
 struct assigner<1, space::device>
 {
@@ -305,7 +350,7 @@ struct assigner<5, space::device>
   template <typename E1, typename E2>
   static void run(E1& lhs, const E2& rhs, stream_view stream)
   {
-    // printf("assigner<6, device>\n");
+    // printf("assigner<5, device>\n");
     dim3 numThreads(BS_X, BS_Y);
     dim3 numBlocks((lhs.shape(0) * lhs.shape(1) + BS_X - 1) / BS_X,
                    (lhs.shape(2) * lhs.shape(3) + BS_Y - 1) / BS_Y,
@@ -341,6 +386,27 @@ struct assigner<6, space::device>
   }
 };
 
+template <>
+struct assigner<7, space::device>
+{
+  template <typename E1, typename E2>
+  static void run(E1& lhs, const E2& rhs, stream_view stream)
+  {
+    // printf("assigner<7, device>\n");
+    dim3 numThreads(BS_X, BS_Y);
+    dim3 numBlocks((lhs.shape(0) * lhs.shape(1) + BS_X - 1) / BS_X,
+                   (lhs.shape(2) * lhs.shape(3) + BS_Y - 1) / BS_Y,
+                   lhs.shape(4) * lhs.shape(5) * lhs.shape(6));
+
+    gpuSyncIfEnabledStream(stream);
+    // std::cout << "rhs " << typeid(rhs.to_kernel()).name() << "\n";
+    gtLaunchKernel(kernel_assign_7, numBlocks, numThreads, 0,
+                   stream.get_backend_stream(), lhs.to_kernel(),
+                   rhs.to_kernel());
+    gpuSyncIfEnabledStream(stream);
+  }
+};
+
 #else // not defined GTENSOR_PER_DIM_KERNELS
 
 template <typename Elhs, typename Erhs, size_type N>
diff --git a/include/gtensor/expression.h b/include/gtensor/expression.h
index dc72b036..0c867fc0 100644
--- a/include/gtensor/expression.h
+++ b/include/gtensor/expression.h
@@ -124,6 +124,12 @@ GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<6> idx)
   return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]);
 }
 
+  template <typename E>
+GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<7> idx)
+{
+  return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6]);
+}
+
 } // namespace gt
 
 #endif
diff --git a/include/gtensor/gtensor.h b/include/gtensor/gtensor.h
index 507f0d20..00aca410 100644
--- a/include/gtensor/gtensor.h
+++ b/include/gtensor/gtensor.h
@@ -293,6 +293,28 @@ __global__ void kernel_launch(gt::shape_type<6> shape, F f)
   }
 }
 
+
+template <typename F>
+__global__ void kernel_launch(gt::shape_type<7> shape, F f)
+{
+  int i = threadIdx.x + blockIdx.x * BS_X;
+  int j = threadIdx.y + blockIdx.y * BS_Y;
+  int b = blockIdx.z;
+  int o = b / (shape[2] * shape[3] * shape[4] * shape[5]);
+  b -= o * (shape[2] * shape[3] * shape[4] * hspae[5]);
+  int n = b / (shape[2] * shape[3] * shape[4]);
+  b -= n * (shape[2] * shape[3] * shape[4]);
+  int m = b / (shape[2] * shape[3]);
+  b -= m * (shape[2] * shape[3]);
+  int l = b / shape[2];
+  b -= l * shape[2];
+  int k = b;
+
+  if (i < shape[0] && j < shape[1]) {
+    f(i, j, k, l, m, n, o);
+  }
+}
+
 #else // not GTENSOR_PER_DIM_KERNELS
 
 template <typename F, size_type N>
@@ -418,6 +440,31 @@ struct launch<6, space::host>
   }
 };
 
+
+template <>
+struct launch<7, space::host>
+{
+  template <typename F>
+  static void run(const gt::shape_type<7>& shape, F&& f, gt::stream_view stream)
+  {
+    for (int o = 0; o < shape[6]; o++) {
+      for (int n = 0; n < shape[5]; n++) {
+	for (int m = 0; m < shape[4]; m++) {
+	  for (int l = 0; l < shape[3]; l++) {
+	    for (int k = 0; k < shape[2]; k++) {
+	      for (int j = 0; j < shape[1]; j++) {
+		for (int i = 0; i < shape[0]; i++) {
+		  std::forward<F>(f)(i, j, k, l, m, n, o);
+		}
+	      }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 #if defined(GTENSOR_DEVICE_CUDA) || defined(GTENSOR_DEVICE_HIP)
 
 #ifdef GTENSOR_PER_DIM_KERNELS
@@ -519,6 +566,21 @@ struct launch<6, space::device>
   }
 };
 
+template <>
+struct launch<7, space::device>
+{
+  template <typename F>
+  static void run(const gt::shape_type<7>& shape, F&& f, gt::stream_view stream)
+  {
+    dim3 numThreads(BS_X, BS_Y);
+    dim3 numBlocks((shape[0] + BS_X - 1) / BS_X, (shape[1] + BS_Y - 1) / BS_Y,
+                   shape[2] * shape[3] * shape[4] * shape[5] * shape[6]);
+
+    gtLaunchKernel(kernel_launch, numBlocks, numThreads, 0,
+                   stream.get_backend_stream(), shape, std::forward<F>(f));
+  }
+};
+
 #else // not GTENSOR_PER_DIM_KERNELS
 
 template <int N>
diff --git a/include/gtensor/operator.h b/include/gtensor/operator.h
index 774dc6d6..df0acea2 100644
--- a/include/gtensor/operator.h
+++ b/include/gtensor/operator.h
@@ -355,6 +355,38 @@ struct equals<6, 6, space::host, space::host>
   }
 };
 
+
+template <>
+struct equals<7, 7, space::host, space::host>
+{
+  template <typename E1, typename E2>
+  static bool run(const E1& e1, const E2& e2)
+  {
+    if (e1.shape() != e2.shape()) {
+      return false;
+    }
+
+    for (int v = 0; v < e1.shape(6); v++) {
+      for (int z = 0; z < e1.shape(5); z++) {
+	for (int y = 0; y < e1.shape(4); y++) {
+	  for (int x = 0; x < e1.shape(3); x++) {
+	    for (int k = 0; k < e1.shape(2); k++) {
+	      for (int j = 0; j < e1.shape(1); j++) {
+		for (int i = 0; i < e1.shape(0); i++) {
+		  if (e1(i, j, k, x, y, z, v) != e2(i, j, k, x, y, z, v)) {
+		    return false;
+		  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    return true;
+  }
+};
+
 #ifdef GTENSOR_HAVE_DEVICE
 
 template <size_type N1, size_type N2>

From 14388cc882b03f1884d68e7ff837574c477a012a Mon Sep 17 00:00:00 2001
From: gabriele merlo <gmerlo00@login01.leonardo.local>
Date: Sat, 31 Aug 2024 16:07:16 +0200
Subject: [PATCH 2/7] run clang-format

---
 include/gtensor/assign.h     | 20 +++++++++-----------
 include/gtensor/expression.h |  2 +-
 include/gtensor/gtensor.h    | 18 ++++++++----------
 include/gtensor/operator.h   | 17 ++++++++---------
 tests/test_launch.cxx        | 11 ++++++-----
 5 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/include/gtensor/assign.h b/include/gtensor/assign.h
index c834ff43..947c9b72 100644
--- a/include/gtensor/assign.h
+++ b/include/gtensor/assign.h
@@ -134,7 +134,6 @@ struct assigner<6, space::host>
   }
 };
 
-
 template <>
 struct assigner<7, space::host>
 {
@@ -144,14 +143,14 @@ struct assigner<7, space::host>
     // printf("assigner<7, host>\n");
     for (int o = 0; o < lhs.shape(6); o++) {
       for (int n = 0; n < lhs.shape(5); n++) {
-	for (int m = 0; m < lhs.shape(4); m++) {
-	  for (int l = 0; l < lhs.shape(3); l++) {
-	    for (int k = 0; k < lhs.shape(2); k++) {
-	      for (int j = 0; j < lhs.shape(1); j++) {
-		for (int i = 0; i < lhs.shape(0); i++) {
-		  lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
-		}
-	      }
+        for (int m = 0; m < lhs.shape(4); m++) {
+          for (int l = 0; l < lhs.shape(3); l++) {
+            for (int k = 0; k < lhs.shape(2); k++) {
+              for (int j = 0; j < lhs.shape(1); j++) {
+                for (int i = 0; i < lhs.shape(0); i++) {
+                  lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
+                }
+              }
             }
           }
         }
@@ -257,8 +256,7 @@ __global__ void kernel_assign_7(Elhs lhs, Erhs _rhs)
     int j = tidx / lhs.shape(0), i = tidx % lhs.shape(0);
     int l = tidy / lhs.shape(2), k = tidy % lhs.shape(2);
     int m = tidz % lhs.shape(5) % lhs.shape(4);
-    int n = tidz % lhs.shape(5) / lhs.shape(4),
-    int o = tidz / lhs.shape(5);
+    int n = tidz % lhs.shape(5) / lhs.shape(4), int o = tidz / lhs.shape(5);
 
     lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, o);
   }
diff --git a/include/gtensor/expression.h b/include/gtensor/expression.h
index 0c867fc0..c3580254 100644
--- a/include/gtensor/expression.h
+++ b/include/gtensor/expression.h
@@ -124,7 +124,7 @@ GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<6> idx)
   return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]);
 }
 
-  template <typename E>
+template <typename E>
 GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<7> idx)
 {
   return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6]);
diff --git a/include/gtensor/gtensor.h b/include/gtensor/gtensor.h
index 970736d5..a7d5637d 100644
--- a/include/gtensor/gtensor.h
+++ b/include/gtensor/gtensor.h
@@ -303,7 +303,6 @@ __global__ void kernel_launch(gt::shape_type<6> shape, F f)
   }
 }
 
-
 template <typename F>
 __global__ void kernel_launch(gt::shape_type<7> shape, F f)
 {
@@ -450,7 +449,6 @@ struct launch<6, space::host>
   }
 };
 
-
 template <>
 struct launch<7, space::host>
 {
@@ -459,14 +457,14 @@ struct launch<7, space::host>
   {
     for (int o = 0; o < shape[6]; o++) {
       for (int n = 0; n < shape[5]; n++) {
-	for (int m = 0; m < shape[4]; m++) {
-	  for (int l = 0; l < shape[3]; l++) {
-	    for (int k = 0; k < shape[2]; k++) {
-	      for (int j = 0; j < shape[1]; j++) {
-		for (int i = 0; i < shape[0]; i++) {
-		  std::forward<F>(f)(i, j, k, l, m, n, o);
-		}
-	      }
+        for (int m = 0; m < shape[4]; m++) {
+          for (int l = 0; l < shape[3]; l++) {
+            for (int k = 0; k < shape[2]; k++) {
+              for (int j = 0; j < shape[1]; j++) {
+                for (int i = 0; i < shape[0]; i++) {
+                  std::forward<F>(f)(i, j, k, l, m, n, o);
+                }
+              }
             }
           }
         }
diff --git a/include/gtensor/operator.h b/include/gtensor/operator.h
index df0acea2..3b8bb5f7 100644
--- a/include/gtensor/operator.h
+++ b/include/gtensor/operator.h
@@ -355,7 +355,6 @@ struct equals<6, 6, space::host, space::host>
   }
 };
 
-
 template <>
 struct equals<7, 7, space::host, space::host>
 {
@@ -368,14 +367,14 @@ struct equals<7, 7, space::host, space::host>
 
     for (int v = 0; v < e1.shape(6); v++) {
       for (int z = 0; z < e1.shape(5); z++) {
-	for (int y = 0; y < e1.shape(4); y++) {
-	  for (int x = 0; x < e1.shape(3); x++) {
-	    for (int k = 0; k < e1.shape(2); k++) {
-	      for (int j = 0; j < e1.shape(1); j++) {
-		for (int i = 0; i < e1.shape(0); i++) {
-		  if (e1(i, j, k, x, y, z, v) != e2(i, j, k, x, y, z, v)) {
-		    return false;
-		  }
+        for (int y = 0; y < e1.shape(4); y++) {
+          for (int x = 0; x < e1.shape(3); x++) {
+            for (int k = 0; k < e1.shape(2); k++) {
+              for (int j = 0; j < e1.shape(1); j++) {
+                for (int i = 0; i < e1.shape(0); i++) {
+                  if (e1(i, j, k, x, y, z, v) != e2(i, j, k, x, y, z, v)) {
+                    return false;
+                  }
                 }
               }
             }
diff --git a/tests/test_launch.cxx b/tests/test_launch.cxx
index 928c1b3a..0b86cb95 100644
--- a/tests/test_launch.cxx
+++ b/tests/test_launch.cxx
@@ -193,8 +193,9 @@ void device_double_add_7d(gt::gtensor_device<double, 7>& a,
   auto k_b = b.to_kernel();
 
   gt::launch<7>(
-		a.shape(), GT_LAMBDA(int i, int j, int k, int l, int m, int n, int o) {
-		  k_b(i, j, k, l, m, n, o) = k_a(i, j, k, l, m, n, o) + k_a(i, j, k, l, m, n, o);
+    a.shape(), GT_LAMBDA(int i, int j, int k, int l, int m, int n, int o) {
+      k_b(i, j, k, l, m, n, o) =
+        k_a(i, j, k, l, m, n, o) + k_a(i, j, k, l, m, n, o);
     });
   gt::copy(b, out);
 }
@@ -270,9 +271,9 @@ TEST(gtensor, device_launch_7d)
         for (int l = 0; l < h_a.shape(3); l++) {
           for (int m = 0; m < h_a.shape(4); m++) {
             for (int n = 0; n < h_a.shape(5); n++) {
-	      for (int o = 0; n < h_a.shape(6); o++) {
-		h_a(i, j, k, l, m, n, o) = i + j + k + l + m + n +o;
-	      }
+              for (int o = 0; n < h_a.shape(6); o++) {
+                h_a(i, j, k, l, m, n, o) = i + j + k + l + m + n + o;
+              }
             }
           }
         }

From 66a58a7efceed89d219f7659e522e329becb06c1 Mon Sep 17 00:00:00 2001
From: gabriele merlo <gmerlo00@login07.leonardo.local>
Date: Mon, 2 Sep 2024 08:43:12 +0200
Subject: [PATCH 3/7] use newline instead of commas

---
 include/gtensor/assign.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/gtensor/assign.h b/include/gtensor/assign.h
index 947c9b72..cc622468 100644
--- a/include/gtensor/assign.h
+++ b/include/gtensor/assign.h
@@ -253,10 +253,13 @@ __global__ void kernel_assign_7(Elhs lhs, Erhs _rhs)
   int tidz = blockIdx.z;
   if (tidx < lhs.shape(0) * lhs.shape(1) &&
       tidy < lhs.shape(2) * lhs.shape(3)) {
-    int j = tidx / lhs.shape(0), i = tidx % lhs.shape(0);
-    int l = tidy / lhs.shape(2), k = tidy % lhs.shape(2);
+    int j = tidx / lhs.shape(0);
+    int i = tidx % lhs.shape(0);
+    int l = tidy / lhs.shape(2);
+    int k = tidy % lhs.shape(2);
+    int n = tidz % lhs.shape(5) / lhs.shape(4);
     int m = tidz % lhs.shape(5) % lhs.shape(4);
-    int n = tidz % lhs.shape(5) / lhs.shape(4), int o = tidz / lhs.shape(5);
+    int o = tidz / lhs.shape(5);
 
     lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, o);
   }

From 5757f63e268a5155166a103de230b91a488152da Mon Sep 17 00:00:00 2001
From: gabriele merlo <gmerlo00@login07.leonardo.local>
Date: Mon, 2 Sep 2024 08:43:32 +0200
Subject: [PATCH 4/7] Add tests for assign 7d

---
 tests/test_assign.cxx | 600 ++++++++++++++++++++++++------------------
 1 file changed, 338 insertions(+), 262 deletions(-)

diff --git a/tests/test_assign.cxx b/tests/test_assign.cxx
index 6d43111d..ff63ac5f 100644
--- a/tests/test_assign.cxx
+++ b/tests/test_assign.cxx
@@ -23,6 +23,22 @@ TEST(assign, gtensor_6d)
   EXPECT_EQ(a, b);
 }
 
+TEST(assign, gtensor_7d)
+{
+  gt::gtensor<int, 7> a(gt::shape(2, 3, 4, 5, 6, 7, 8));
+  gt::gtensor<int, 7> b(a.shape());
+
+  int* adata = a.data();
+
+  for (int i = 0; i < a.size(); i++) {
+    adata[i] = i;
+  }
+
+  EXPECT_NE(a, b);
+  b = a;
+  EXPECT_EQ(a, b);
+}
+
 TEST(assign, gview_1d_scalar)
 {
   auto a = gt::empty<int>(gt::shape(5));
@@ -81,326 +97,386 @@ TEST(assign, broadcast_6d)
   }
 }
 
-#ifdef GTENSOR_HAVE_DEVICE
-
-TEST(assign, device_gtensor_6d)
+TEST(assign, broadcast_7d)
 {
-  gt::gtensor_device<int, 6> a(gt::shape(2, 3, 4, 5, 6, 7));
-  gt::gtensor_device<int, 6> b(a.shape());
-  gt::gtensor<int, 6> h_a(a.shape());
-  gt::gtensor<int, 6> h_b(a.shape());
+  gt::gtensor<int, 7> a(gt::shape(8, 1, 2, 4, 1, 1, 6), 0);
+  gt::gtensor<int, 7> b(gt::shape(8, 1, 2, 1, 1, 1, 6), -7);
 
-  int* adata = h_a.data();
+  gt::assign(a, b);
 
-  for (int i = 0; i < a.size(); i++) {
-    adata[i] = i;
+  for (int i = 0; i < a.shape(0); i++) {
+    for (int j = 0; j < a.shape(2); j++) {
+      for (int k = 0; k < a.shape(3); k++) {
+        for (int l = 0; l < a.shape(6); l++) {
+          EXPECT_EQ(a(i, 0, j, k, 0, 0, l), -7);
+        }
+      }
+    }
   }
 
-  gt::copy(h_a, a);
-  b = a;
-  gt::copy(b, h_b);
+#ifdef GTENSOR_HAVE_DEVICE
 
-  EXPECT_EQ(h_a, h_b);
-}
+  TEST(assign, device_gtensor_6d)
+  {
+    gt::gtensor_device<int, 6> a(gt::shape(2, 3, 4, 5, 6, 7));
+    gt::gtensor_device<int, 6> b(a.shape());
+    gt::gtensor<int, 6> h_a(a.shape());
+    gt::gtensor<int, 6> h_b(a.shape());
 
-TEST(assign, device_gtensor_fill)
-{
-  gt::gtensor_device<float, 2> a(gt::shape(2, 3));
-  gt::gtensor<float, 2> h_a(a.shape());
+    int* adata = h_a.data();
 
-  a.fill(9001);
+    for (int i = 0; i < a.size(); i++) {
+      adata[i] = i;
+    }
 
-  gt::copy(a, h_a);
-  EXPECT_EQ(h_a, (gt::gtensor_device<float, 2>{
-                   {9001, 9001}, {9001, 9001}, {9001, 9001}}));
-}
+    gt::copy(h_a, a);
+    b = a;
+    gt::copy(b, h_b);
 
-TEST(assign, device_gview_fill)
-{
-  gt::gtensor_device<float, 2> a(gt::shape(2, 3));
-  gt::gtensor<float, 2> h_a(a.shape());
-  auto av = a.view(gt::all, gt::all);
+    EXPECT_EQ(h_a, h_b);
+  }
 
-  av.fill(9001);
+  TEST(assign, device_gtensor_7d)
+  {
+    gt::gtensor_device<int, 7> a(gt::shape(2, 3, 4, 5, 6, 7, 8));
+    gt::gtensor_device<int, 7> b(a.shape());
+    gt::gtensor<int, 7> h_a(a.shape());
+    gt::gtensor<int, 7> h_b(a.shape());
 
-  gt::copy(a, h_a);
-  EXPECT_EQ(h_a,
-            (gt::gtensor<float, 2>{{9001, 9001}, {9001, 9001}, {9001, 9001}}));
-}
+    int* adata = h_a.data();
 
-TEST(assign, device_span_fill)
-{
-  gt::gtensor_device<float, 2> a(gt::shape(2, 3));
-  gt::gtensor<float, 2> h_a(a.shape());
-  auto as = a.to_kernel();
+    for (int i = 0; i < a.size(); i++) {
+      adata[i] = i;
+    }
 
-  as.fill(9001);
+    gt::copy(h_a, a);
+    b = a;
+    gt::copy(b, h_b);
 
-  gt::copy(a, h_a);
-  EXPECT_EQ(h_a,
-            (gt::gtensor<float, 2>{{9001, 9001}, {9001, 9001}, {9001, 9001}}));
-}
+    EXPECT_EQ(h_a, h_b);
+  }
 
-TEST(assign, device_gview_1d_scalar)
-{
-  auto a = gt::empty_device<int>(gt::shape(5));
-  auto h_a = gt::empty_like(a);
-  auto aview = a.view(gt::all);
+  TEST(assign, device_gtensor_fill)
+  {
+    gt::gtensor_device<float, 2> a(gt::shape(2, 3));
+    gt::gtensor<float, 2> h_a(a.shape());
 
-  aview = gt::scalar(5);
+    a.fill(9001);
+
+    gt::copy(a, h_a);
+    EXPECT_EQ(h_a, (gt::gtensor_device<float, 2>{
+                     {9001, 9001}, {9001, 9001}, {9001, 9001}}));
+  }
 
-  gt::copy(a, h_a);
+  TEST(assign, device_gview_fill)
+  {
+    gt::gtensor_device<float, 2> a(gt::shape(2, 3));
+    gt::gtensor<float, 2> h_a(a.shape());
+    auto av = a.view(gt::all, gt::all);
 
-  EXPECT_EQ(h_a, (gt::gtensor<int, 1>{5, 5, 5, 5, 5}));
-}
+    av.fill(9001);
 
-TEST(assign, device_gtensor_large_2d)
-{
-  gt::gtensor_device<int, 2> a(gt::shape(2, 17920000));
-  gt::gtensor_device<int, 2> b(a.shape());
-  gt::gtensor<int, 2> h_a(a.shape());
-  gt::gtensor<int, 2> h_b(a.shape());
+    gt::copy(a, h_a);
+    EXPECT_EQ(
+      h_a, (gt::gtensor<float, 2>{{9001, 9001}, {9001, 9001}, {9001, 9001}}));
+  }
 
-  int* adata = h_a.data();
+  TEST(assign, device_span_fill)
+  {
+    gt::gtensor_device<float, 2> a(gt::shape(2, 3));
+    gt::gtensor<float, 2> h_a(a.shape());
+    auto as = a.to_kernel();
 
-  for (int i = 0; i < a.size(); i++) {
-    adata[i] = i;
+    as.fill(9001);
+
+    gt::copy(a, h_a);
+    EXPECT_EQ(
+      h_a, (gt::gtensor<float, 2>{{9001, 9001}, {9001, 9001}, {9001, 9001}}));
   }
 
-  gt::copy(h_a, a);
-  // NB: b = a calls the default operator= which ends up triggering
-  // and underlying storage vector copy, usually a device memcpy, so
-  // it doesn't launch the gtensor assign kernel. Call assign directly
-  // to exercise the code
-  assign(b, a);
-  gt::copy(b, h_b);
+  TEST(assign, device_gview_1d_scalar)
+  {
+    auto a = gt::empty_device<int>(gt::shape(5));
+    auto h_a = gt::empty_like(a);
+    auto aview = a.view(gt::all);
 
-  EXPECT_EQ(h_a, h_b);
-}
+    aview = gt::scalar(5);
 
-TEST(assign, device_view_noncontiguous_6d)
-{
-  using T = gt::complex<double>;
-
-  int nzb = 2;
-  int nvb = 2;
-  int nwb = 2;
-
-  // ijklmn, no ghost
-  auto g_shape = gt::shape(32, 4, 48, 40, 30, 2);
-
-  // ijklmn, ghost in z, v, w
-  auto f_shape =
-    gt::shape(g_shape[0], g_shape[1], g_shape[2] + 2 * nzb,
-              g_shape[3] + 2 * nvb, g_shape[4] + 2 * nwb, g_shape[5]);
-  // i klmn, no ghost
-  auto papbar_shape =
-    gt::shape(g_shape[0], g_shape[2], g_shape[3], g_shape[4], g_shape[5]);
-  // ijz mn, ghost in z
-  auto bar_apar_shape =
-    gt::shape(g_shape[0], g_shape[1], f_shape[2], g_shape[4], g_shape[5]);
-  auto h_g = gt::full(g_shape, T(2.0));
-  auto d_g = gt::empty_device<T>(g_shape);
-  auto h_papbar = gt::full(papbar_shape, T(1.5));
-  auto d_papbar = gt::empty_device<T>(papbar_shape);
-  auto h_bar_apar = gt::full(bar_apar_shape, T(0.0, -1.0));
-  auto d_bar_apar = gt::empty_device<T>(bar_apar_shape);
-  auto h_f = gt::full<T>(f_shape, T(100.0));
-  auto d_f = gt::empty_device<T>(f_shape);
-
-  gt::copy(h_g, d_g);
-  gt::copy(h_papbar, d_papbar);
-  gt::copy(h_bar_apar, d_bar_apar);
-  gt::copy(h_f, d_f);
-
-  auto lhs_view = d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb),
-                           gt::slice(nvb, -nvb), gt::slice(nwb, -nwb), gt::all);
-  auto d_papbar_view =
-    d_papbar.view(gt::all, gt::newaxis, gt::all, gt::all, gt::all, gt::all);
-  auto d_bar_apar_view = d_bar_apar.view(gt::all, gt::all, gt::slice(nzb, -nzb),
-                                         gt::newaxis, gt::all, gt::all);
-  auto rhs_view = d_g + d_papbar_view * d_bar_apar_view;
-
-  GT_DEBUG_VAR(d_g.shape());
-  GT_DEBUG_VAR(d_papbar_view.shape());
-  GT_DEBUG_VAR(d_bar_apar_view.shape());
-  GT_DEBUG_VAR(lhs_view.shape());
-  GT_DEBUG_VAR(rhs_view.shape());
-
-  lhs_view = rhs_view;
-
-  /*
-  d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
-  gt::slice(nwb, -nwb), gt::all) = d_g + d_papbar.view(gt::all, gt::newaxis,
-  gt::all, gt::all, gt::all, gt::all) * d_bar_apar.view(gt::all, gt::all,
-  gt::slice(nzb, -nzb), gt::newaxis, gt::all, gt::all);
-  */
-
-  gt::copy(d_f, h_f);
-
-  // spot check boundary, not changed
-  EXPECT_EQ(h_f(3, 3, nzb - 1, nvb, nwb, 1), T(100.0));
-  EXPECT_EQ(h_f(3, 3, nzb, nvb - 1, nwb, 1), T(100.0));
-  EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb - 1, 1), T(100.0));
-  EXPECT_EQ(
-    h_f(0, 1, f_shape[2] - nzb, f_shape[3] - nvb - 1, f_shape[4] - nwb - 1, 1),
-    T(100.0));
-  EXPECT_EQ(
-    h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb, f_shape[4] - nwb - 1, 1),
-    T(100.0));
-  EXPECT_EQ(
-    h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1, f_shape[4] - nwb, 1),
-    T(100.0));
-
-  // spot check inside that was changed
-  EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1,
-                f_shape[4] - nwb - 1, 1),
-            T(2.0, -1.5));
-  EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(2.0, -1.5));
-
-  gt::synchronize();
-}
+    gt::copy(a, h_a);
 
-TEST(assign, device_view_noncontiguous_6d_scalar)
-{
-  using T = gt::complex<double>;
+    EXPECT_EQ(h_a, (gt::gtensor<int, 1>{5, 5, 5, 5, 5}));
+  }
 
-  int nzb = 2;
-  int nvb = 2;
-  int nwb = 0;
+  TEST(assign, device_gtensor_large_2d)
+  {
+    gt::gtensor_device<int, 2> a(gt::shape(2, 17920000));
+    gt::gtensor_device<int, 2> b(a.shape());
+    gt::gtensor<int, 2> h_a(a.shape());
+    gt::gtensor<int, 2> h_b(a.shape());
 
-  // ijklmn, ghost in z, v, w
-  auto f_shape = gt::shape(5, 7, 9, 11, 13, 2);
+    int* adata = h_a.data();
 
-  auto h_f = gt::full<T>(f_shape, T(100.0));
-  auto d_f = gt::empty_device<T>(f_shape);
+    for (int i = 0; i < a.size(); i++) {
+      adata[i] = i;
+    }
 
-  gt::copy(h_f, d_f);
+    gt::copy(h_a, a);
+    // NB: b = a calls the default operator= which ends up triggering
+    // and underlying storage vector copy, usually a device memcpy, so
+    // it doesn't launch the gtensor assign kernel. Call assign directly
+    // to exercise the code
+    assign(b, a);
+    gt::copy(b, h_b);
 
-  auto f_size = d_f.size();
-  GT_DEBUG_VAR(d_f.shape());
-  GT_DEBUG_VAR(f_size);
+    EXPECT_EQ(h_a, h_b);
+  }
 
-  auto d_f_noghost =
+  TEST(assign, device_view_noncontiguous_6d)
+  {
+    using T = gt::complex<double>;
+
+    int nzb = 2;
+    int nvb = 2;
+    int nwb = 2;
+
+    // ijklmn, no ghost
+    auto g_shape = gt::shape(32, 4, 48, 40, 30, 2);
+
+    // ijklmn, ghost in z, v, w
+    auto f_shape =
+      gt::shape(g_shape[0], g_shape[1], g_shape[2] + 2 * nzb,
+                g_shape[3] + 2 * nvb, g_shape[4] + 2 * nwb, g_shape[5]);
+    // i klmn, no ghost
+    auto papbar_shape =
+      gt::shape(g_shape[0], g_shape[2], g_shape[3], g_shape[4], g_shape[5]);
+    // ijz mn, ghost in z
+    auto bar_apar_shape =
+      gt::shape(g_shape[0], g_shape[1], f_shape[2], g_shape[4], g_shape[5]);
+    auto h_g = gt::full(g_shape, T(2.0));
+    auto d_g = gt::empty_device<T>(g_shape);
+    auto h_papbar = gt::full(papbar_shape, T(1.5));
+    auto d_papbar = gt::empty_device<T>(papbar_shape);
+    auto h_bar_apar = gt::full(bar_apar_shape, T(0.0, -1.0));
+    auto d_bar_apar = gt::empty_device<T>(bar_apar_shape);
+    auto h_f = gt::full<T>(f_shape, T(100.0));
+    auto d_f = gt::empty_device<T>(f_shape);
+
+    gt::copy(h_g, d_g);
+    gt::copy(h_papbar, d_papbar);
+    gt::copy(h_bar_apar, d_bar_apar);
+    gt::copy(h_f, d_f);
+
+    auto lhs_view =
+      d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
+               gt::slice(nwb, -nwb), gt::all);
+    auto d_papbar_view =
+      d_papbar.view(gt::all, gt::newaxis, gt::all, gt::all, gt::all, gt::all);
+    auto d_bar_apar_view = d_bar_apar.view(
+      gt::all, gt::all, gt::slice(nzb, -nzb), gt::newaxis, gt::all, gt::all);
+    auto rhs_view = d_g + d_papbar_view * d_bar_apar_view;
+
+    GT_DEBUG_VAR(d_g.shape());
+    GT_DEBUG_VAR(d_papbar_view.shape());
+    GT_DEBUG_VAR(d_bar_apar_view.shape());
+    GT_DEBUG_VAR(lhs_view.shape());
+    GT_DEBUG_VAR(rhs_view.shape());
+
+    lhs_view = rhs_view;
+
+    /*
     d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
-             gt::slice(nwb, -nwb), gt::all);
-  auto f_noghost_size = d_f_noghost.size();
-  GT_DEBUG_TYPE(d_f_noghost);
-  GT_DEBUG_VAR(d_f_noghost.shape());
-  GT_DEBUG_VAR(f_noghost_size);
-
-  d_f_noghost = 1.0;
-
-  gt::copy(d_f, h_f);
-
-  // spot check boundary, not changed
-  EXPECT_EQ(h_f(3, 3, nzb - 1, nvb, nwb, 1), T(100.0));
-  EXPECT_EQ(h_f(3, 3, nzb, nvb - 1, nwb, 1), T(100.0));
-  EXPECT_EQ(
-    h_f(0, 1, f_shape[2] - nzb, f_shape[3] - nvb - 1, f_shape[4] - nwb - 1, 1),
-    T(100.0));
-  EXPECT_EQ(
-    h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb, f_shape[4] - nwb - 1, 1),
-    T(100.0));
-
-  // note: interior, since nwb == 0
-  EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(1.0));
-  EXPECT_EQ(
-    h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1, f_shape[4] - 1, 1),
-    T(1.0));
-
-  // spot check inside that was changed
-  EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(1.0));
-
-  gt::synchronize();
-}
+    gt::slice(nwb, -nwb), gt::all) = d_g + d_papbar.view(gt::all, gt::newaxis,
+    gt::all, gt::all, gt::all, gt::all) * d_bar_apar.view(gt::all, gt::all,
+    gt::slice(nzb, -nzb), gt::newaxis, gt::all, gt::all);
+    */
+
+    gt::copy(d_f, h_f);
+
+    // spot check boundary, not changed
+    EXPECT_EQ(h_f(3, 3, nzb - 1, nvb, nwb, 1), T(100.0));
+    EXPECT_EQ(h_f(3, 3, nzb, nvb - 1, nwb, 1), T(100.0));
+    EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb - 1, 1), T(100.0));
+    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb, f_shape[3] - nvb - 1,
+                  f_shape[4] - nwb - 1, 1),
+              T(100.0));
+    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb,
+                  f_shape[4] - nwb - 1, 1),
+              T(100.0));
+    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1,
+                  f_shape[4] - nwb, 1),
+              T(100.0));
+
+    // spot check inside that was changed
+    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1,
+                  f_shape[4] - nwb - 1, 1),
+              T(2.0, -1.5));
+    EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(2.0, -1.5));
+
+    gt::synchronize();
+  }
 
-TEST(assign, device_gfunction_mismatch_throw)
-{
-  using T = gt::complex<double>;
+  TEST(assign, device_view_noncontiguous_6d_scalar)
+  {
+    using T = gt::complex<double>;
 
-  int nzb = 2;
-  int nvb = 2;
-  int nwb = 0;
+    int nzb = 2;
+    int nvb = 2;
+    int nwb = 0;
 
-  // ijklmn, ghost in z, v, w
-  auto f_shape = gt::shape(5, 7, 9, 11, 13, 2);
+    // ijklmn, ghost in z, v, w
+    auto f_shape = gt::shape(5, 7, 9, 11, 13, 2);
 
-  auto g_shape =
-    gt::shape(f_shape[0], f_shape[1], f_shape[2] - 2 * nzb,
-              f_shape[3] - 2 * nvb, f_shape[4] - 2 * nwb, f_shape[5]);
+    auto h_f = gt::full<T>(f_shape, T(100.0));
+    auto d_f = gt::empty_device<T>(f_shape);
 
-  auto h_f = gt::full<T>(f_shape, T(100.0));
-  auto d_f = gt::empty_device<T>(f_shape);
-  auto h_g = gt::full(g_shape, T(2.0));
-  auto d_g = gt::empty_device<T>(g_shape);
+    gt::copy(h_f, d_f);
 
-  gt::copy(h_f, d_f);
-  gt::copy(h_g, d_g);
+    auto f_size = d_f.size();
+    GT_DEBUG_VAR(d_f.shape());
+    GT_DEBUG_VAR(f_size);
 
-  EXPECT_THROW(h_g + h_f, std::runtime_error);
-  EXPECT_THROW(d_g + d_f, std::runtime_error);
-}
+    auto d_f_noghost =
+      d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
+               gt::slice(nwb, -nwb), gt::all);
+    auto f_noghost_size = d_f_noghost.size();
+    GT_DEBUG_TYPE(d_f_noghost);
+    GT_DEBUG_VAR(d_f_noghost.shape());
+    GT_DEBUG_VAR(f_noghost_size);
 
-namespace test
-{
-template <typename T, gt::size_type N>
-using gtensor_managed = gt::gtensor_container<gt::space::managed_vector<T>, N>;
-} // end namespace test
+    d_f_noghost = 1.0;
 
-TEST(assign, device_gene_h_from_f)
-{
-  using T = gt::complex<double>;
+    gt::copy(d_f, h_f);
 
-  const int nwb = 0;
+    // spot check boundary, not changed
+    EXPECT_EQ(h_f(3, 3, nzb - 1, nvb, nwb, 1), T(100.0));
+    EXPECT_EQ(h_f(3, 3, nzb, nvb - 1, nwb, 1), T(100.0));
+    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb, f_shape[3] - nvb - 1,
+                  f_shape[4] - nwb - 1, 1),
+              T(100.0));
+    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb,
+                  f_shape[4] - nwb - 1, 1),
+              T(100.0));
 
-  // ijklmn, ghost in w
-  auto hdist_shape = gt::shape(32, 4, 48, 40, 30 + 2 * nwb, 2);
-  auto fdist_shape = gt::shape(32, 4, 48, 40, 30 + 2 * nwb, 2);
+    // note: interior, since nwb == 0
+    EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(1.0));
+    EXPECT_EQ(
+      h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1, f_shape[4] - 1, 1),
+      T(1.0));
 
-  // ijklmn, no ghost
-  auto prefac_shape = gt::shape(32, 4, 48, 40, 30, 2);
+    // spot check inside that was changed
+    EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(1.0));
 
-  // ijk + ? axis
-  auto phi_shape = gt::shape(32, 4, 48, 2);
+    gt::synchronize();
+  }
 
-  test::gtensor_managed<T, 6> hdist(hdist_shape, T(2.0));
-  test::gtensor_managed<T, 6> fdist(fdist_shape, T(1.0));
-  test::gtensor_managed<T, 6> prefac(prefac_shape, T(-1.0));
-  test::gtensor_managed<T, 4> phi(phi_shape, T(-2.0));
-  gt::gtensor<T, 6> expected(hdist_shape, T(3.0));
+  TEST(assign, device_gfunction_mismatch_throw)
+  {
+    using T = gt::complex<double>;
 
-  hdist.view(gt::all, gt::all, gt::all, gt::all, gt::slice(nwb, -nwb),
-             gt::all) =
-    fdist.view(gt::all, gt::all, gt::all, gt::all, gt::slice(nwb, -nwb),
-               gt::all) +
-    prefac * phi.view(gt::all, gt::all, gt::all, 0, gt::newaxis, gt::newaxis,
-                      gt::newaxis);
+    int nzb = 2;
+    int nvb = 2;
+    int nwb = 0;
 
-  gt::synchronize();
+    // ijklmn, ghost in z, v, w
+    auto f_shape = gt::shape(5, 7, 9, 11, 13, 2);
 
-  // spot check
-  EXPECT_EQ(hdist, expected);
-}
+    auto g_shape =
+      gt::shape(f_shape[0], f_shape[1], f_shape[2] - 2 * nzb,
+                f_shape[3] - 2 * nvb, f_shape[4] - 2 * nwb, f_shape[5]);
 
-TEST(assign, device_broadcast_6d)
-{
-  gt::gtensor_device<int, 6> a(gt::shape(8, 1, 2, 4, 1, 1), 0);
-  gt::gtensor_device<int, 6> b(gt::shape(8, 1, 2, 1, 1, 1), -7);
+    auto h_f = gt::full<T>(f_shape, T(100.0));
+    auto d_f = gt::empty_device<T>(f_shape);
+    auto h_g = gt::full(g_shape, T(2.0));
+    auto d_g = gt::empty_device<T>(g_shape);
 
-  gt::gtensor<int, 6> h_a(a.shape());
+    gt::copy(h_f, d_f);
+    gt::copy(h_g, d_g);
 
-  gt::assign(a, b);
+    EXPECT_THROW(h_g + h_f, std::runtime_error);
+    EXPECT_THROW(d_g + d_f, std::runtime_error);
+  }
+
+  namespace test
+  {
+  template <typename T, gt::size_type N>
+  using gtensor_managed =
+    gt::gtensor_container<gt::space::managed_vector<T>, N>;
+  } // end namespace test
+
+  TEST(assign, device_gene_h_from_f)
+  {
+    using T = gt::complex<double>;
+
+    const int nwb = 0;
+
+    // ijklmn, ghost in w
+    auto hdist_shape = gt::shape(32, 4, 48, 40, 30 + 2 * nwb, 2);
+    auto fdist_shape = gt::shape(32, 4, 48, 40, 30 + 2 * nwb, 2);
+
+    // ijklmn, no ghost
+    auto prefac_shape = gt::shape(32, 4, 48, 40, 30, 2);
+
+    // ijk + ? axis
+    auto phi_shape = gt::shape(32, 4, 48, 2);
+
+    test::gtensor_managed<T, 6> hdist(hdist_shape, T(2.0));
+    test::gtensor_managed<T, 6> fdist(fdist_shape, T(1.0));
+    test::gtensor_managed<T, 6> prefac(prefac_shape, T(-1.0));
+    test::gtensor_managed<T, 4> phi(phi_shape, T(-2.0));
+    gt::gtensor<T, 6> expected(hdist_shape, T(3.0));
+
+    hdist.view(gt::all, gt::all, gt::all, gt::all, gt::slice(nwb, -nwb),
+               gt::all) =
+      fdist.view(gt::all, gt::all, gt::all, gt::all, gt::slice(nwb, -nwb),
+                 gt::all) +
+      prefac * phi.view(gt::all, gt::all, gt::all, 0, gt::newaxis, gt::newaxis,
+                        gt::newaxis);
+
+    gt::synchronize();
+
+    // spot check
+    EXPECT_EQ(hdist, expected);
+  }
 
-  gt::copy(a, h_a);
+  TEST(assign, device_broadcast_6d)
+  {
+    gt::gtensor_device<int, 6> a(gt::shape(8, 1, 2, 4, 1, 1), 0);
+    gt::gtensor_device<int, 6> b(gt::shape(8, 1, 2, 1, 1, 1), -7);
 
-  for (int i = 0; i < h_a.shape(0); i++) {
-    for (int j = 0; j < h_a.shape(2); j++) {
-      for (int k = 0; k < h_a.shape(3); k++) {
-        EXPECT_EQ(h_a(i, 0, j, k, 0, 0), -7);
+    gt::gtensor<int, 6> h_a(a.shape());
+
+    gt::assign(a, b);
+
+    gt::copy(a, h_a);
+
+    for (int i = 0; i < h_a.shape(0); i++) {
+      for (int j = 0; j < h_a.shape(2); j++) {
+        for (int k = 0; k < h_a.shape(3); k++) {
+          EXPECT_EQ(h_a(i, 0, j, k, 0, 0), -7);
+        }
       }
     }
   }
-}
+
+  TEST(assign, device_broadcast_7d)
+  {
+    gt::gtensor_device<int, 7> a(gt::shape(8, 1, 2, 4, 1, 1, 4), 0);
+    gt::gtensor_device<int, 7> b(gt::shape(8, 1, 2, 1, 1, 1, 4), -7);
+
+    gt::gtensor<int, 7> h_a(a.shape());
+
+    gt::assign(a, b);
+
+    gt::copy(a, h_a);
+
+    for (int i = 0; i < h_a.shape(0); i++) {
+      for (int j = 0; j < h_a.shape(2); j++) {
+        for (int k = 0; k < h_a.shape(3); k++) {
+          for (int l = 0; l < h_a.shape(6); l++) {
+            EXPECT_EQ(h_a(i, 0, j, k, 0, 0, l), -7);
+          }
+        }
+      }
+    }
 
 #endif // GTENSOR_HAVE_DEVICE

From 774360ebd60d0ed7cf89223fa2e4931096afc79b Mon Sep 17 00:00:00 2001
From: gabriele merlo <gmerlo00@login01.leonardo.local>
Date: Mon, 2 Sep 2024 15:02:57 +0200
Subject: [PATCH 5/7] Bugfix, forgot parenthesis

---
 tests/test_assign.cxx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_assign.cxx b/tests/test_assign.cxx
index ff63ac5f..1b1d6b5b 100644
--- a/tests/test_assign.cxx
+++ b/tests/test_assign.cxx
@@ -113,7 +113,7 @@ TEST(assign, broadcast_7d)
       }
     }
   }
-
+}
 #ifdef GTENSOR_HAVE_DEVICE
 
   TEST(assign, device_gtensor_6d)
@@ -478,5 +478,6 @@ TEST(assign, broadcast_7d)
         }
       }
     }
+  }
 
 #endif // GTENSOR_HAVE_DEVICE

From 9217325515e6c8819d0525fc6bda214f11f67a78 Mon Sep 17 00:00:00 2001
From: gabriele merlo <gmerlo00@login01.leonardo.local>
Date: Mon, 2 Sep 2024 16:44:51 +0200
Subject: [PATCH 6/7] run clang-format

---
 tests/test_assign.cxx | 580 +++++++++++++++++++++---------------------
 1 file changed, 289 insertions(+), 291 deletions(-)

diff --git a/tests/test_assign.cxx b/tests/test_assign.cxx
index 1b1d6b5b..06f76660 100644
--- a/tests/test_assign.cxx
+++ b/tests/test_assign.cxx
@@ -116,368 +116,366 @@ TEST(assign, broadcast_7d)
 }
 #ifdef GTENSOR_HAVE_DEVICE
 
-  TEST(assign, device_gtensor_6d)
-  {
-    gt::gtensor_device<int, 6> a(gt::shape(2, 3, 4, 5, 6, 7));
-    gt::gtensor_device<int, 6> b(a.shape());
-    gt::gtensor<int, 6> h_a(a.shape());
-    gt::gtensor<int, 6> h_b(a.shape());
-
-    int* adata = h_a.data();
-
-    for (int i = 0; i < a.size(); i++) {
-      adata[i] = i;
-    }
+TEST(assign, device_gtensor_6d)
+{
+  gt::gtensor_device<int, 6> a(gt::shape(2, 3, 4, 5, 6, 7));
+  gt::gtensor_device<int, 6> b(a.shape());
+  gt::gtensor<int, 6> h_a(a.shape());
+  gt::gtensor<int, 6> h_b(a.shape());
 
-    gt::copy(h_a, a);
-    b = a;
-    gt::copy(b, h_b);
+  int* adata = h_a.data();
 
-    EXPECT_EQ(h_a, h_b);
+  for (int i = 0; i < a.size(); i++) {
+    adata[i] = i;
   }
 
-  TEST(assign, device_gtensor_7d)
-  {
-    gt::gtensor_device<int, 7> a(gt::shape(2, 3, 4, 5, 6, 7, 8));
-    gt::gtensor_device<int, 7> b(a.shape());
-    gt::gtensor<int, 7> h_a(a.shape());
-    gt::gtensor<int, 7> h_b(a.shape());
+  gt::copy(h_a, a);
+  b = a;
+  gt::copy(b, h_b);
 
-    int* adata = h_a.data();
+  EXPECT_EQ(h_a, h_b);
+}
 
-    for (int i = 0; i < a.size(); i++) {
-      adata[i] = i;
-    }
+TEST(assign, device_gtensor_7d)
+{
+  gt::gtensor_device<int, 7> a(gt::shape(2, 3, 4, 5, 6, 7, 8));
+  gt::gtensor_device<int, 7> b(a.shape());
+  gt::gtensor<int, 7> h_a(a.shape());
+  gt::gtensor<int, 7> h_b(a.shape());
 
-    gt::copy(h_a, a);
-    b = a;
-    gt::copy(b, h_b);
+  int* adata = h_a.data();
 
-    EXPECT_EQ(h_a, h_b);
+  for (int i = 0; i < a.size(); i++) {
+    adata[i] = i;
   }
 
-  TEST(assign, device_gtensor_fill)
-  {
-    gt::gtensor_device<float, 2> a(gt::shape(2, 3));
-    gt::gtensor<float, 2> h_a(a.shape());
-
-    a.fill(9001);
+  gt::copy(h_a, a);
+  b = a;
+  gt::copy(b, h_b);
 
-    gt::copy(a, h_a);
-    EXPECT_EQ(h_a, (gt::gtensor_device<float, 2>{
-                     {9001, 9001}, {9001, 9001}, {9001, 9001}}));
-  }
+  EXPECT_EQ(h_a, h_b);
+}
 
-  TEST(assign, device_gview_fill)
-  {
-    gt::gtensor_device<float, 2> a(gt::shape(2, 3));
-    gt::gtensor<float, 2> h_a(a.shape());
-    auto av = a.view(gt::all, gt::all);
+TEST(assign, device_gtensor_fill)
+{
+  gt::gtensor_device<float, 2> a(gt::shape(2, 3));
+  gt::gtensor<float, 2> h_a(a.shape());
 
-    av.fill(9001);
+  a.fill(9001);
 
-    gt::copy(a, h_a);
-    EXPECT_EQ(
-      h_a, (gt::gtensor<float, 2>{{9001, 9001}, {9001, 9001}, {9001, 9001}}));
-  }
+  gt::copy(a, h_a);
+  EXPECT_EQ(h_a, (gt::gtensor_device<float, 2>{
+                   {9001, 9001}, {9001, 9001}, {9001, 9001}}));
+}
 
-  TEST(assign, device_span_fill)
-  {
-    gt::gtensor_device<float, 2> a(gt::shape(2, 3));
-    gt::gtensor<float, 2> h_a(a.shape());
-    auto as = a.to_kernel();
+TEST(assign, device_gview_fill)
+{
+  gt::gtensor_device<float, 2> a(gt::shape(2, 3));
+  gt::gtensor<float, 2> h_a(a.shape());
+  auto av = a.view(gt::all, gt::all);
 
-    as.fill(9001);
+  av.fill(9001);
 
-    gt::copy(a, h_a);
-    EXPECT_EQ(
-      h_a, (gt::gtensor<float, 2>{{9001, 9001}, {9001, 9001}, {9001, 9001}}));
-  }
+  gt::copy(a, h_a);
+  EXPECT_EQ(h_a,
+            (gt::gtensor<float, 2>{{9001, 9001}, {9001, 9001}, {9001, 9001}}));
+}
 
-  TEST(assign, device_gview_1d_scalar)
-  {
-    auto a = gt::empty_device<int>(gt::shape(5));
-    auto h_a = gt::empty_like(a);
-    auto aview = a.view(gt::all);
+TEST(assign, device_span_fill)
+{
+  gt::gtensor_device<float, 2> a(gt::shape(2, 3));
+  gt::gtensor<float, 2> h_a(a.shape());
+  auto as = a.to_kernel();
 
-    aview = gt::scalar(5);
+  as.fill(9001);
 
-    gt::copy(a, h_a);
+  gt::copy(a, h_a);
+  EXPECT_EQ(h_a,
+            (gt::gtensor<float, 2>{{9001, 9001}, {9001, 9001}, {9001, 9001}}));
+}
 
-    EXPECT_EQ(h_a, (gt::gtensor<int, 1>{5, 5, 5, 5, 5}));
-  }
+TEST(assign, device_gview_1d_scalar)
+{
+  auto a = gt::empty_device<int>(gt::shape(5));
+  auto h_a = gt::empty_like(a);
+  auto aview = a.view(gt::all);
 
-  TEST(assign, device_gtensor_large_2d)
-  {
-    gt::gtensor_device<int, 2> a(gt::shape(2, 17920000));
-    gt::gtensor_device<int, 2> b(a.shape());
-    gt::gtensor<int, 2> h_a(a.shape());
-    gt::gtensor<int, 2> h_b(a.shape());
+  aview = gt::scalar(5);
 
-    int* adata = h_a.data();
+  gt::copy(a, h_a);
 
-    for (int i = 0; i < a.size(); i++) {
-      adata[i] = i;
-    }
+  EXPECT_EQ(h_a, (gt::gtensor<int, 1>{5, 5, 5, 5, 5}));
+}
 
-    gt::copy(h_a, a);
-    // NB: b = a calls the default operator= which ends up triggering
-    // and underlying storage vector copy, usually a device memcpy, so
-    // it doesn't launch the gtensor assign kernel. Call assign directly
-    // to exercise the code
-    assign(b, a);
-    gt::copy(b, h_b);
+TEST(assign, device_gtensor_large_2d)
+{
+  gt::gtensor_device<int, 2> a(gt::shape(2, 17920000));
+  gt::gtensor_device<int, 2> b(a.shape());
+  gt::gtensor<int, 2> h_a(a.shape());
+  gt::gtensor<int, 2> h_b(a.shape());
 
-    EXPECT_EQ(h_a, h_b);
-  }
+  int* adata = h_a.data();
 
-  TEST(assign, device_view_noncontiguous_6d)
-  {
-    using T = gt::complex<double>;
-
-    int nzb = 2;
-    int nvb = 2;
-    int nwb = 2;
-
-    // ijklmn, no ghost
-    auto g_shape = gt::shape(32, 4, 48, 40, 30, 2);
-
-    // ijklmn, ghost in z, v, w
-    auto f_shape =
-      gt::shape(g_shape[0], g_shape[1], g_shape[2] + 2 * nzb,
-                g_shape[3] + 2 * nvb, g_shape[4] + 2 * nwb, g_shape[5]);
-    // i klmn, no ghost
-    auto papbar_shape =
-      gt::shape(g_shape[0], g_shape[2], g_shape[3], g_shape[4], g_shape[5]);
-    // ijz mn, ghost in z
-    auto bar_apar_shape =
-      gt::shape(g_shape[0], g_shape[1], f_shape[2], g_shape[4], g_shape[5]);
-    auto h_g = gt::full(g_shape, T(2.0));
-    auto d_g = gt::empty_device<T>(g_shape);
-    auto h_papbar = gt::full(papbar_shape, T(1.5));
-    auto d_papbar = gt::empty_device<T>(papbar_shape);
-    auto h_bar_apar = gt::full(bar_apar_shape, T(0.0, -1.0));
-    auto d_bar_apar = gt::empty_device<T>(bar_apar_shape);
-    auto h_f = gt::full<T>(f_shape, T(100.0));
-    auto d_f = gt::empty_device<T>(f_shape);
-
-    gt::copy(h_g, d_g);
-    gt::copy(h_papbar, d_papbar);
-    gt::copy(h_bar_apar, d_bar_apar);
-    gt::copy(h_f, d_f);
-
-    auto lhs_view =
-      d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
-               gt::slice(nwb, -nwb), gt::all);
-    auto d_papbar_view =
-      d_papbar.view(gt::all, gt::newaxis, gt::all, gt::all, gt::all, gt::all);
-    auto d_bar_apar_view = d_bar_apar.view(
-      gt::all, gt::all, gt::slice(nzb, -nzb), gt::newaxis, gt::all, gt::all);
-    auto rhs_view = d_g + d_papbar_view * d_bar_apar_view;
-
-    GT_DEBUG_VAR(d_g.shape());
-    GT_DEBUG_VAR(d_papbar_view.shape());
-    GT_DEBUG_VAR(d_bar_apar_view.shape());
-    GT_DEBUG_VAR(lhs_view.shape());
-    GT_DEBUG_VAR(rhs_view.shape());
-
-    lhs_view = rhs_view;
-
-    /*
-    d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
-    gt::slice(nwb, -nwb), gt::all) = d_g + d_papbar.view(gt::all, gt::newaxis,
-    gt::all, gt::all, gt::all, gt::all) * d_bar_apar.view(gt::all, gt::all,
-    gt::slice(nzb, -nzb), gt::newaxis, gt::all, gt::all);
-    */
-
-    gt::copy(d_f, h_f);
-
-    // spot check boundary, not changed
-    EXPECT_EQ(h_f(3, 3, nzb - 1, nvb, nwb, 1), T(100.0));
-    EXPECT_EQ(h_f(3, 3, nzb, nvb - 1, nwb, 1), T(100.0));
-    EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb - 1, 1), T(100.0));
-    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb, f_shape[3] - nvb - 1,
-                  f_shape[4] - nwb - 1, 1),
-              T(100.0));
-    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb,
-                  f_shape[4] - nwb - 1, 1),
-              T(100.0));
-    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1,
-                  f_shape[4] - nwb, 1),
-              T(100.0));
-
-    // spot check inside that was changed
-    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1,
-                  f_shape[4] - nwb - 1, 1),
-              T(2.0, -1.5));
-    EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(2.0, -1.5));
-
-    gt::synchronize();
+  for (int i = 0; i < a.size(); i++) {
+    adata[i] = i;
   }
 
-  TEST(assign, device_view_noncontiguous_6d_scalar)
-  {
-    using T = gt::complex<double>;
+  gt::copy(h_a, a);
+  // NB: b = a calls the default operator= which ends up triggering
+  // and underlying storage vector copy, usually a device memcpy, so
+  // it doesn't launch the gtensor assign kernel. Call assign directly
+  // to exercise the code
+  assign(b, a);
+  gt::copy(b, h_b);
 
-    int nzb = 2;
-    int nvb = 2;
-    int nwb = 0;
-
-    // ijklmn, ghost in z, v, w
-    auto f_shape = gt::shape(5, 7, 9, 11, 13, 2);
-
-    auto h_f = gt::full<T>(f_shape, T(100.0));
-    auto d_f = gt::empty_device<T>(f_shape);
-
-    gt::copy(h_f, d_f);
+  EXPECT_EQ(h_a, h_b);
+}
 
-    auto f_size = d_f.size();
-    GT_DEBUG_VAR(d_f.shape());
-    GT_DEBUG_VAR(f_size);
+TEST(assign, device_view_noncontiguous_6d)
+{
+  using T = gt::complex<double>;
+
+  int nzb = 2;
+  int nvb = 2;
+  int nwb = 2;
+
+  // ijklmn, no ghost
+  auto g_shape = gt::shape(32, 4, 48, 40, 30, 2);
+
+  // ijklmn, ghost in z, v, w
+  auto f_shape =
+    gt::shape(g_shape[0], g_shape[1], g_shape[2] + 2 * nzb,
+              g_shape[3] + 2 * nvb, g_shape[4] + 2 * nwb, g_shape[5]);
+  // i klmn, no ghost
+  auto papbar_shape =
+    gt::shape(g_shape[0], g_shape[2], g_shape[3], g_shape[4], g_shape[5]);
+  // ijz mn, ghost in z
+  auto bar_apar_shape =
+    gt::shape(g_shape[0], g_shape[1], f_shape[2], g_shape[4], g_shape[5]);
+  auto h_g = gt::full(g_shape, T(2.0));
+  auto d_g = gt::empty_device<T>(g_shape);
+  auto h_papbar = gt::full(papbar_shape, T(1.5));
+  auto d_papbar = gt::empty_device<T>(papbar_shape);
+  auto h_bar_apar = gt::full(bar_apar_shape, T(0.0, -1.0));
+  auto d_bar_apar = gt::empty_device<T>(bar_apar_shape);
+  auto h_f = gt::full<T>(f_shape, T(100.0));
+  auto d_f = gt::empty_device<T>(f_shape);
+
+  gt::copy(h_g, d_g);
+  gt::copy(h_papbar, d_papbar);
+  gt::copy(h_bar_apar, d_bar_apar);
+  gt::copy(h_f, d_f);
+
+  auto lhs_view = d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb),
+                           gt::slice(nvb, -nvb), gt::slice(nwb, -nwb), gt::all);
+  auto d_papbar_view =
+    d_papbar.view(gt::all, gt::newaxis, gt::all, gt::all, gt::all, gt::all);
+  auto d_bar_apar_view = d_bar_apar.view(gt::all, gt::all, gt::slice(nzb, -nzb),
+                                         gt::newaxis, gt::all, gt::all);
+  auto rhs_view = d_g + d_papbar_view * d_bar_apar_view;
+
+  GT_DEBUG_VAR(d_g.shape());
+  GT_DEBUG_VAR(d_papbar_view.shape());
+  GT_DEBUG_VAR(d_bar_apar_view.shape());
+  GT_DEBUG_VAR(lhs_view.shape());
+  GT_DEBUG_VAR(rhs_view.shape());
+
+  lhs_view = rhs_view;
+
+  /*
+  d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
+  gt::slice(nwb, -nwb), gt::all) = d_g + d_papbar.view(gt::all, gt::newaxis,
+  gt::all, gt::all, gt::all, gt::all) * d_bar_apar.view(gt::all, gt::all,
+  gt::slice(nzb, -nzb), gt::newaxis, gt::all, gt::all);
+  */
+
+  gt::copy(d_f, h_f);
+
+  // spot check boundary, not changed
+  EXPECT_EQ(h_f(3, 3, nzb - 1, nvb, nwb, 1), T(100.0));
+  EXPECT_EQ(h_f(3, 3, nzb, nvb - 1, nwb, 1), T(100.0));
+  EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb - 1, 1), T(100.0));
+  EXPECT_EQ(
+    h_f(0, 1, f_shape[2] - nzb, f_shape[3] - nvb - 1, f_shape[4] - nwb - 1, 1),
+    T(100.0));
+  EXPECT_EQ(
+    h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb, f_shape[4] - nwb - 1, 1),
+    T(100.0));
+  EXPECT_EQ(
+    h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1, f_shape[4] - nwb, 1),
+    T(100.0));
+
+  // spot check inside that was changed
+  EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1,
+                f_shape[4] - nwb - 1, 1),
+            T(2.0, -1.5));
+  EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(2.0, -1.5));
+
+  gt::synchronize();
+}
 
-    auto d_f_noghost =
-      d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
-               gt::slice(nwb, -nwb), gt::all);
-    auto f_noghost_size = d_f_noghost.size();
-    GT_DEBUG_TYPE(d_f_noghost);
-    GT_DEBUG_VAR(d_f_noghost.shape());
-    GT_DEBUG_VAR(f_noghost_size);
+TEST(assign, device_view_noncontiguous_6d_scalar)
+{
+  using T = gt::complex<double>;
 
-    d_f_noghost = 1.0;
+  int nzb = 2;
+  int nvb = 2;
+  int nwb = 0;
 
-    gt::copy(d_f, h_f);
+  // ijklmn, ghost in z, v, w
+  auto f_shape = gt::shape(5, 7, 9, 11, 13, 2);
 
-    // spot check boundary, not changed
-    EXPECT_EQ(h_f(3, 3, nzb - 1, nvb, nwb, 1), T(100.0));
-    EXPECT_EQ(h_f(3, 3, nzb, nvb - 1, nwb, 1), T(100.0));
-    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb, f_shape[3] - nvb - 1,
-                  f_shape[4] - nwb - 1, 1),
-              T(100.0));
-    EXPECT_EQ(h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb,
-                  f_shape[4] - nwb - 1, 1),
-              T(100.0));
+  auto h_f = gt::full<T>(f_shape, T(100.0));
+  auto d_f = gt::empty_device<T>(f_shape);
 
-    // note: interior, since nwb == 0
-    EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(1.0));
-    EXPECT_EQ(
-      h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1, f_shape[4] - 1, 1),
-      T(1.0));
+  gt::copy(h_f, d_f);
 
-    // spot check inside that was changed
-    EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(1.0));
+  auto f_size = d_f.size();
+  GT_DEBUG_VAR(d_f.shape());
+  GT_DEBUG_VAR(f_size);
 
-    gt::synchronize();
-  }
+  auto d_f_noghost =
+    d_f.view(gt::all, gt::all, gt::slice(nzb, -nzb), gt::slice(nvb, -nvb),
+             gt::slice(nwb, -nwb), gt::all);
+  auto f_noghost_size = d_f_noghost.size();
+  GT_DEBUG_TYPE(d_f_noghost);
+  GT_DEBUG_VAR(d_f_noghost.shape());
+  GT_DEBUG_VAR(f_noghost_size);
+
+  d_f_noghost = 1.0;
+
+  gt::copy(d_f, h_f);
+
+  // spot check boundary, not changed
+  EXPECT_EQ(h_f(3, 3, nzb - 1, nvb, nwb, 1), T(100.0));
+  EXPECT_EQ(h_f(3, 3, nzb, nvb - 1, nwb, 1), T(100.0));
+  EXPECT_EQ(
+    h_f(0, 1, f_shape[2] - nzb, f_shape[3] - nvb - 1, f_shape[4] - nwb - 1, 1),
+    T(100.0));
+  EXPECT_EQ(
+    h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb, f_shape[4] - nwb - 1, 1),
+    T(100.0));
+
+  // note: interior, since nwb == 0
+  EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(1.0));
+  EXPECT_EQ(
+    h_f(0, 1, f_shape[2] - nzb - 1, f_shape[3] - nvb - 1, f_shape[4] - 1, 1),
+    T(1.0));
+
+  // spot check inside that was changed
+  EXPECT_EQ(h_f(3, 3, nzb, nvb, nwb, 1), T(1.0));
+
+  gt::synchronize();
+}
 
-  TEST(assign, device_gfunction_mismatch_throw)
-  {
-    using T = gt::complex<double>;
+TEST(assign, device_gfunction_mismatch_throw)
+{
+  using T = gt::complex<double>;
 
-    int nzb = 2;
-    int nvb = 2;
-    int nwb = 0;
+  int nzb = 2;
+  int nvb = 2;
+  int nwb = 0;
 
-    // ijklmn, ghost in z, v, w
-    auto f_shape = gt::shape(5, 7, 9, 11, 13, 2);
+  // ijklmn, ghost in z, v, w
+  auto f_shape = gt::shape(5, 7, 9, 11, 13, 2);
 
-    auto g_shape =
-      gt::shape(f_shape[0], f_shape[1], f_shape[2] - 2 * nzb,
-                f_shape[3] - 2 * nvb, f_shape[4] - 2 * nwb, f_shape[5]);
+  auto g_shape =
+    gt::shape(f_shape[0], f_shape[1], f_shape[2] - 2 * nzb,
+              f_shape[3] - 2 * nvb, f_shape[4] - 2 * nwb, f_shape[5]);
 
-    auto h_f = gt::full<T>(f_shape, T(100.0));
-    auto d_f = gt::empty_device<T>(f_shape);
-    auto h_g = gt::full(g_shape, T(2.0));
-    auto d_g = gt::empty_device<T>(g_shape);
+  auto h_f = gt::full<T>(f_shape, T(100.0));
+  auto d_f = gt::empty_device<T>(f_shape);
+  auto h_g = gt::full(g_shape, T(2.0));
+  auto d_g = gt::empty_device<T>(g_shape);
 
-    gt::copy(h_f, d_f);
-    gt::copy(h_g, d_g);
+  gt::copy(h_f, d_f);
+  gt::copy(h_g, d_g);
 
-    EXPECT_THROW(h_g + h_f, std::runtime_error);
-    EXPECT_THROW(d_g + d_f, std::runtime_error);
-  }
+  EXPECT_THROW(h_g + h_f, std::runtime_error);
+  EXPECT_THROW(d_g + d_f, std::runtime_error);
+}
 
-  namespace test
-  {
-  template <typename T, gt::size_type N>
-  using gtensor_managed =
-    gt::gtensor_container<gt::space::managed_vector<T>, N>;
-  } // end namespace test
+namespace test
+{
+template <typename T, gt::size_type N>
+using gtensor_managed = gt::gtensor_container<gt::space::managed_vector<T>, N>;
+} // end namespace test
 
-  TEST(assign, device_gene_h_from_f)
-  {
-    using T = gt::complex<double>;
+TEST(assign, device_gene_h_from_f)
+{
+  using T = gt::complex<double>;
 
-    const int nwb = 0;
+  const int nwb = 0;
 
-    // ijklmn, ghost in w
-    auto hdist_shape = gt::shape(32, 4, 48, 40, 30 + 2 * nwb, 2);
-    auto fdist_shape = gt::shape(32, 4, 48, 40, 30 + 2 * nwb, 2);
+  // ijklmn, ghost in w
+  auto hdist_shape = gt::shape(32, 4, 48, 40, 30 + 2 * nwb, 2);
+  auto fdist_shape = gt::shape(32, 4, 48, 40, 30 + 2 * nwb, 2);
 
-    // ijklmn, no ghost
-    auto prefac_shape = gt::shape(32, 4, 48, 40, 30, 2);
+  // ijklmn, no ghost
+  auto prefac_shape = gt::shape(32, 4, 48, 40, 30, 2);
 
-    // ijk + ? axis
-    auto phi_shape = gt::shape(32, 4, 48, 2);
+  // ijk + ? axis
+  auto phi_shape = gt::shape(32, 4, 48, 2);
 
-    test::gtensor_managed<T, 6> hdist(hdist_shape, T(2.0));
-    test::gtensor_managed<T, 6> fdist(fdist_shape, T(1.0));
-    test::gtensor_managed<T, 6> prefac(prefac_shape, T(-1.0));
-    test::gtensor_managed<T, 4> phi(phi_shape, T(-2.0));
-    gt::gtensor<T, 6> expected(hdist_shape, T(3.0));
+  test::gtensor_managed<T, 6> hdist(hdist_shape, T(2.0));
+  test::gtensor_managed<T, 6> fdist(fdist_shape, T(1.0));
+  test::gtensor_managed<T, 6> prefac(prefac_shape, T(-1.0));
+  test::gtensor_managed<T, 4> phi(phi_shape, T(-2.0));
+  gt::gtensor<T, 6> expected(hdist_shape, T(3.0));
 
-    hdist.view(gt::all, gt::all, gt::all, gt::all, gt::slice(nwb, -nwb),
-               gt::all) =
-      fdist.view(gt::all, gt::all, gt::all, gt::all, gt::slice(nwb, -nwb),
-                 gt::all) +
-      prefac * phi.view(gt::all, gt::all, gt::all, 0, gt::newaxis, gt::newaxis,
-                        gt::newaxis);
+  hdist.view(gt::all, gt::all, gt::all, gt::all, gt::slice(nwb, -nwb),
+             gt::all) =
+    fdist.view(gt::all, gt::all, gt::all, gt::all, gt::slice(nwb, -nwb),
+               gt::all) +
+    prefac * phi.view(gt::all, gt::all, gt::all, 0, gt::newaxis, gt::newaxis,
+                      gt::newaxis);
 
-    gt::synchronize();
+  gt::synchronize();
 
-    // spot check
-    EXPECT_EQ(hdist, expected);
-  }
+  // spot check
+  EXPECT_EQ(hdist, expected);
+}
 
-  TEST(assign, device_broadcast_6d)
-  {
-    gt::gtensor_device<int, 6> a(gt::shape(8, 1, 2, 4, 1, 1), 0);
-    gt::gtensor_device<int, 6> b(gt::shape(8, 1, 2, 1, 1, 1), -7);
+TEST(assign, device_broadcast_6d)
+{
+  gt::gtensor_device<int, 6> a(gt::shape(8, 1, 2, 4, 1, 1), 0);
+  gt::gtensor_device<int, 6> b(gt::shape(8, 1, 2, 1, 1, 1), -7);
 
-    gt::gtensor<int, 6> h_a(a.shape());
+  gt::gtensor<int, 6> h_a(a.shape());
 
-    gt::assign(a, b);
+  gt::assign(a, b);
 
-    gt::copy(a, h_a);
+  gt::copy(a, h_a);
 
-    for (int i = 0; i < h_a.shape(0); i++) {
-      for (int j = 0; j < h_a.shape(2); j++) {
-        for (int k = 0; k < h_a.shape(3); k++) {
-          EXPECT_EQ(h_a(i, 0, j, k, 0, 0), -7);
-        }
+  for (int i = 0; i < h_a.shape(0); i++) {
+    for (int j = 0; j < h_a.shape(2); j++) {
+      for (int k = 0; k < h_a.shape(3); k++) {
+        EXPECT_EQ(h_a(i, 0, j, k, 0, 0), -7);
       }
     }
   }
+}
 
-  TEST(assign, device_broadcast_7d)
-  {
-    gt::gtensor_device<int, 7> a(gt::shape(8, 1, 2, 4, 1, 1, 4), 0);
-    gt::gtensor_device<int, 7> b(gt::shape(8, 1, 2, 1, 1, 1, 4), -7);
+TEST(assign, device_broadcast_7d)
+{
+  gt::gtensor_device<int, 7> a(gt::shape(8, 1, 2, 4, 1, 1, 4), 0);
+  gt::gtensor_device<int, 7> b(gt::shape(8, 1, 2, 1, 1, 1, 4), -7);
 
-    gt::gtensor<int, 7> h_a(a.shape());
+  gt::gtensor<int, 7> h_a(a.shape());
 
-    gt::assign(a, b);
+  gt::assign(a, b);
 
-    gt::copy(a, h_a);
+  gt::copy(a, h_a);
 
-    for (int i = 0; i < h_a.shape(0); i++) {
-      for (int j = 0; j < h_a.shape(2); j++) {
-        for (int k = 0; k < h_a.shape(3); k++) {
-          for (int l = 0; l < h_a.shape(6); l++) {
-            EXPECT_EQ(h_a(i, 0, j, k, 0, 0, l), -7);
-          }
+  for (int i = 0; i < h_a.shape(0); i++) {
+    for (int j = 0; j < h_a.shape(2); j++) {
+      for (int k = 0; k < h_a.shape(3); k++) {
+        for (int l = 0; l < h_a.shape(6); l++) {
+          EXPECT_EQ(h_a(i, 0, j, k, 0, 0, l), -7);
         }
       }
     }
   }
+}
 
 #endif // GTENSOR_HAVE_DEVICE

From 1480f034c4bb9aea913c0db3d6392ee80c53a85d Mon Sep 17 00:00:00 2001
From: Bryce Allen <bryce@bda.space>
Date: Tue, 17 Sep 2024 12:48:43 -0400
Subject: [PATCH 7/7] tests: fix 7d launch init indexing

---
 tests/test_launch.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_launch.cxx b/tests/test_launch.cxx
index 0b86cb95..6ad2a378 100644
--- a/tests/test_launch.cxx
+++ b/tests/test_launch.cxx
@@ -271,7 +271,7 @@ TEST(gtensor, device_launch_7d)
         for (int l = 0; l < h_a.shape(3); l++) {
           for (int m = 0; m < h_a.shape(4); m++) {
             for (int n = 0; n < h_a.shape(5); n++) {
-              for (int o = 0; n < h_a.shape(6); o++) {
+              for (int o = 0; o < h_a.shape(6); o++) {
                 h_a(i, j, k, l, m, n, o) = i + j + k + l + m + n + o;
               }
             }