From c4acebd185997f6f95157635dc9be7d034e06dc4 Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Wed, 30 Jun 2021 16:20:28 -0600
Subject: [PATCH 01/14] Panzer: fix some disc-fe core tests for UVM off

---
 .../src/Panzer_IntrepidOrientation.cpp        |  2 +-
 .../core_tests/hierarchic_team_policy.cpp     |  4 +-
 .../test/core_tests/zero_sensitivities.cpp    | 50 ++++++++++++-------
 3 files changed, 37 insertions(+), 19 deletions(-)
diff --git a/packages/panzer/disc-fe/src/Panzer_IntrepidOrientation.cpp b/packages/panzer/disc-fe/src/Panzer_IntrepidOrientation.cpp
index a74cde808066..6367820cd334 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntrepidOrientation.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntrepidOrientation.cpp
@@ -94,7 +94,7 @@ namespace panzer {
       // construct orientation information
       for (int c=0;c<numElementsPerBlock;++c) {
         const int localCellId = elementBlock.at(c);
-        PHX::View<const panzer::GlobalOrdinal*>
+        Kokkos::View<const panzer::GlobalOrdinal*,Kokkos::HostSpace>
           nodes(connMgr.getConnectivity(localCellId), numVertexPerCell);
         orientation[localCellId] = (Intrepid2::Orientation::getOrientation(cellTopo, nodes));
       }
diff --git a/packages/panzer/disc-fe/test/core_tests/hierarchic_team_policy.cpp b/packages/panzer/disc-fe/test/core_tests/hierarchic_team_policy.cpp
index 4da950f4e7b8..9db18fc0a4f2 100644
--- a/packages/panzer/disc-fe/test/core_tests/hierarchic_team_policy.cpp
+++ b/packages/panzer/disc-fe/test/core_tests/hierarchic_team_policy.cpp
@@ -44,6 +44,8 @@
 #include <Teuchos_UnitTestHarness.hpp>
 #include <Teuchos_RCP.hpp>
 
+#include "KokkosExp_View_Fad.hpp"
+#include "Kokkos_DynRankView_Fad.hpp"
 #include "Panzer_HierarchicParallelism.hpp"
 #include "Sacado.hpp"
 
@@ -88,7 +90,7 @@ namespace panzer_test {
 
     for (int i=0; i < M; ++i) {
       for (int j=0; j < N; ++j) {
-        TEST_FLOATING_EQUALITY(Sacado::ScalarValue<Scalar>::eval(a(i,j)),3.0,tol);
+        TEST_FLOATING_EQUALITY(Sacado::ScalarValue<Scalar>::eval(a_host(i,j)),3.0,tol);
       }
     }
   }
diff --git a/packages/panzer/disc-fe/test/core_tests/zero_sensitivities.cpp b/packages/panzer/disc-fe/test/core_tests/zero_sensitivities.cpp
index d2c0ad0466b1..f17968209c6c 100644
--- a/packages/panzer/disc-fe/test/core_tests/zero_sensitivities.cpp
+++ b/packages/panzer/disc-fe/test/core_tests/zero_sensitivities.cpp
@@ -43,6 +43,8 @@
 #include <Teuchos_ConfigDefs.hpp>
 #include <Teuchos_UnitTestHarness.hpp>
 
+#include "KokkosExp_View_Fad.hpp"
+#include "Kokkos_DynRankView_Fad.hpp"
 #include "Phalanx_DataLayout_MDALayout.hpp"
 #include "Phalanx_MDField.hpp"
 #include "Phalanx_KokkosViewFactory.hpp"
@@ -137,20 +139,23 @@ namespace panzer {
       a.setFieldData(PHX::KokkosViewFactory<panzer::Traits::RealType,typename PHX::DevLayout<panzer::Traits::RealType>::type,PHX::Device>::buildView(a.fieldTag()));
       
       // initialize
-      for (int cell = 0; cell < a.extent_int(0); ++cell) {
-	for (int pt=0; pt < a.extent_int(1); ++pt) {
-	  a(cell,pt) = 2.0;
-	}
-      }
+      a.deep_copy(2.0);
+      // for (int cell = 0; cell < a.extent_int(0); ++cell) {
+      //   for (int pt=0; pt < a.extent_int(1); ++pt) {
+      //     a(cell,pt) = 2.0;
+      //   }
+      // }
       
       // Compute
       Kokkos::parallel_for(a.extent_int(0),ComputeA<panzer::Traits::RealType,PHX::Device,PHX::MDField<panzer::Traits::RealType,Cell,BASIS> > (a));
       typename PHX::Device().fence();
+
+      auto a_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),a.get_static_view());
       
       // Check
       for (int cell = 0; cell < a.extent_int(0); ++cell)
 	for (int pt=0; pt < a.extent_int(1); ++pt)
-	  TEST_FLOATING_EQUALITY(a(cell,pt),2.0,tolerance);
+	  TEST_FLOATING_EQUALITY(a_host(cell,pt),2.0,tolerance);
     }
 
     // ***********************
@@ -162,28 +167,39 @@ namespace panzer {
       derivative_dimension.push_back(2);
       a.setFieldData(PHX::KokkosViewFactory<panzer::Traits::FadType,typename PHX::DevLayout<panzer::Traits::FadType>::type,PHX::Device>::buildView(a.fieldTag(),derivative_dimension));
       
-      // initialize
+      // Initialize (use raw kokkos to avoid compiler warning about MDField dtor with host function calls).
+      auto a_dev = a.get_static_view();
+      Kokkos::parallel_for("initialize zero sensitivity vec",a.extent(0),KOKKOS_LAMBDA (const int cell) {
+          //for (int cell = 0; cell < a.extent_int(0); ++cell) {
+	for (int pt=0; pt < a_dev.extent_int(1); ++pt) {
+	  a_dev(cell,pt) = 1.0;
+	  a_dev(cell,pt).fastAccessDx(0) = 2.0;
+	  a_dev(cell,pt).fastAccessDx(1) = 3.0;
+	}
+      });
+
+      // Check initial values
+      auto a_host = Kokkos::create_mirror_view(a.get_static_view());
+      Kokkos::deep_copy(a_host,a.get_static_view());
       for (int cell = 0; cell < a.extent_int(0); ++cell) {
 	for (int pt=0; pt < a.extent_int(1); ++pt) {
-	  a(cell,pt) = 1.0;
-	  a(cell,pt).fastAccessDx(0) = 2.0;
-	  a(cell,pt).fastAccessDx(1) = 3.0;
-	  TEST_FLOATING_EQUALITY(a(cell,pt).val(),1.0,1e-12);
-	  TEST_FLOATING_EQUALITY(a(cell,pt).fastAccessDx(0),2.0,1e-12);
-	  TEST_FLOATING_EQUALITY(a(cell,pt).fastAccessDx(1),3.0,1e-12);
+	  TEST_FLOATING_EQUALITY(a_host(cell,pt).val(),1.0,1e-12);
+	  TEST_FLOATING_EQUALITY(a_host(cell,pt).fastAccessDx(0),2.0,1e-12);
+	  TEST_FLOATING_EQUALITY(a_host(cell,pt).fastAccessDx(1),3.0,1e-12);
 	}
       }
-      
+
       // Compute
       Kokkos::parallel_for(a.extent(0),ComputeB<PHX::Device,PHX::MDField<panzer::Traits::FadType,Cell,BASIS> > (a));
       typename PHX::Device().fence();
       
       // Check
+      Kokkos::deep_copy(a_host,a.get_static_view());
       for (int cell = 0; cell < a.extent_int(0); ++cell) {
 	for (int pt=0; pt < a.extent_int(1); ++pt) {
-	  TEST_FLOATING_EQUALITY(a(cell,pt).val(),1.0,1e-12);
-	  TEST_FLOATING_EQUALITY(a(cell,pt).fastAccessDx(0),0.0,1e-12);
-	  TEST_FLOATING_EQUALITY(a(cell,pt).fastAccessDx(1),0.0,1e-12);
+	  TEST_FLOATING_EQUALITY(a_host(cell,pt).val(),1.0,1e-12);
+	  TEST_FLOATING_EQUALITY(a_host(cell,pt).fastAccessDx(0),0.0,1e-12);
+	  TEST_FLOATING_EQUALITY(a_host(cell,pt).fastAccessDx(1),0.0,1e-12);
 	}
       }
 

From 52c15162f61e3948f9ae9fac292b21b60d91752b Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Thu, 1 Jul 2021 06:22:49 -0600
Subject: [PATCH 02/14] Panzer: fix core/evaluators tests for no-UVM

---
 .../panzer/disc-fe/src/evaluators/Panzer_Sum_impl.hpp  |  6 ++++--
 packages/panzer/disc-fe/test/core_tests/evaluators.cpp | 10 ----------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/packages/panzer/disc-fe/src/evaluators/Panzer_Sum_impl.hpp b/packages/panzer/disc-fe/src/evaluators/Panzer_Sum_impl.hpp
index fa0761d4619d..456a8a6e2a8e 100644
--- a/packages/panzer/disc-fe/src/evaluators/Panzer_Sum_impl.hpp
+++ b/packages/panzer/disc-fe/src/evaluators/Panzer_Sum_impl.hpp
@@ -65,6 +65,7 @@ Sum(
 
   // check if the user wants to scale each term independently
   auto local_scalars = PHX::View<double *>("scalars",value_names->size());
+  auto local_scalars_host = Kokkos::create_mirror_view(local_scalars);
   if(p.isType<Teuchos::RCP<const std::vector<double> > >("Scalars")) {
     auto scalars_v = *p.get<Teuchos::RCP<const std::vector<double> > >("Scalars");
 
@@ -72,12 +73,13 @@ Sum(
     TEUCHOS_ASSERT(scalars_v.size()==value_names->size());
 
     for (std::size_t i=0; i < value_names->size(); ++i)
-      local_scalars(i) = scalars_v[i];
+      local_scalars_host(i) = scalars_v[i];
   }
   else {
     for (std::size_t i=0; i < value_names->size(); ++i)
-      local_scalars(i) = 1.0;
+      local_scalars_host(i) = 1.0;
   }
+  Kokkos::deep_copy(local_scalars,local_scalars_host);
 
   scalars = local_scalars; 
   
diff --git a/packages/panzer/disc-fe/test/core_tests/evaluators.cpp b/packages/panzer/disc-fe/test/core_tests/evaluators.cpp
index d5d4c5e06231..ca43b3e47662 100644
--- a/packages/panzer/disc-fe/test/core_tests/evaluators.cpp
+++ b/packages/panzer/disc-fe/test/core_tests/evaluators.cpp
@@ -96,8 +96,6 @@ namespace panzer {
 
   TEUCHOS_UNIT_TEST(evaluators, Sum)
   {
-    
-
     ParameterList p("Sum Test");
     p.set("Sum Name", "Sum of Sources");
     RCP<vector<string> > value_names = rcp(new vector<string>);
@@ -114,8 +112,6 @@ namespace panzer {
 
   TEUCHOS_UNIT_TEST(evaluators, ScalarToVector)
   {
-    
-
     ParameterList p("ScalarToVector Test");
     p.set("Vector Name", "U");
     RCP<vector<string> > scalar_names = rcp(new vector<string>);
@@ -136,8 +132,6 @@ namespace panzer {
 
   TEUCHOS_UNIT_TEST(evaluators, VectorToScalar)
   {
-    
-
     ParameterList p("VectorToScalar Test");
     p.set("Vector Name", "U");
     RCP<vector<string> > scalar_names = rcp(new vector<string>);
@@ -158,8 +152,6 @@ namespace panzer {
 
   TEUCHOS_UNIT_TEST(evaluators, DirichletResidual)
   {
-    
-
     ParameterList p("DirichletResidual Test");
     p.set("Residual Name", "Residual_TEMP");
     p.set("DOF Name", "TEMP");
@@ -173,7 +165,6 @@ namespace panzer {
 
   TEUCHOS_UNIT_TEST(evaluators, DOF)
   {
-    
     using panzer::IntegrationRule;
     using panzer::BasisIRLayout;
 
@@ -298,7 +289,6 @@ namespace panzer {
 
   TEUCHOS_UNIT_TEST(evaluators, Integrator_GradBasisDotVector)
   {
-    
     using panzer::IntegrationRule;
     using panzer::BasisIRLayout;
 

From db999000e6e66740e34d8841b4122515404b4540 Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Thu, 1 Jul 2021 15:02:37 -0600
Subject: [PATCH 03/14] Panzer: start of uvm removal for integration values
 object

---
 .../disc-fe/src/Panzer_IntegrationValues2.cpp | 134 +++++++++--------
 .../disc-fe/src/Panzer_IntegrationValues2.hpp |  10 +-
 .../test/core_tests/integration_values2.cpp   | 140 +++++++++++-------
 3 files changed, 158 insertions(+), 126 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index 6c995b7b2820..1e9660a24e31 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -292,7 +292,7 @@ getCubature(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordinates,
 
   int num_space_dim = int_rule->topology->getDimension();
   if (int_rule->isSide() && num_space_dim==1) {
-    std::cout << "WARNING: 0-D quadrature rule ifrastructure does not exist!!! Will not be able to do "
+    std::cout << "WARNING: 0-D quadrature rule infrastructure does not exist!!! Will not be able to do "
         << "non-natural integration rules.";
     return;
   }
@@ -895,21 +895,12 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
 
   // copy the dynamic data structures into the static data structures
   {
-    size_type num_ip = dyn_cub_points.extent(0);
-    size_type num_dims = dyn_cub_points.extent(1);
-
-    for (size_type ip = 0; ip < num_ip;  ++ip) {
-      cub_weights(ip) = dyn_cub_weights(ip);
-      for (size_type dim = 0; dim < num_dims; ++dim)
-        cub_points(ip,dim) = dyn_cub_points(ip,dim);
-    }
+    Kokkos::deep_copy(cub_weights.get_static_view(),dyn_cub_weights.get_view());
+    Kokkos::deep_copy(cub_points.get_static_view(),dyn_cub_points.get_view());
   }
 
   if (int_rule->isSide()) {
-    const size_type num_ip = dyn_cub_points.extent(0), num_side_dims = dyn_side_cub_points.extent(1);
-    for (size_type ip = 0; ip < num_ip; ++ip)
-      for (size_type dim = 0; dim < num_side_dims; ++dim)
-        side_cub_points(ip,dim) = dyn_side_cub_points(ip,dim);
+    Kokkos::deep_copy(side_cub_points.get_static_view(),dyn_side_cub_points.get_view());
   }
 
   const int num_cells = in_num_cells < 0 ? in_node_coordinates.extent(0) : in_num_cells;
@@ -917,15 +908,14 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
   {
     size_type num_nodes = in_node_coordinates.extent(1);
     size_type num_dims = in_node_coordinates.extent(2);
-
-    for (int cell = 0; cell < num_cells;  ++cell) {
-      for (size_type node = 0; node < num_nodes; ++node) {
-        for (size_type dim = 0; dim < num_dims; ++dim) {
-          node_coordinates(cell,node,dim) =
-              in_node_coordinates(cell,node,dim);
-        }
-      }
-    }
+    auto node_coordinates_k = node_coordinates.get_view();
+    auto in_node_coordinates_k = in_node_coordinates.get_view();
+
+    Kokkos::MDRangePolicy<PHX::Device,Kokkos::Rank<3>> policy({0,0,0},{num_cells,num_nodes,num_dims});
+    Kokkos::parallel_for("node coordinates",policy,KOKKOS_LAMBDA (const int cell,const int node,const int dim) {
+      node_coordinates_k(cell,node,dim) = in_node_coordinates_k(cell,node,dim);
+    });
+    PHX::Device::execution_space().fence();
   }
 
   auto s_in_node_coordinates = Kokkos::subview(in_node_coordinates.get_view(),std::make_pair(0,num_cells),Kokkos::ALL(),Kokkos::ALL());
@@ -961,24 +951,27 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
   else TEUCHOS_ASSERT(false);
 
   // Shakib contravarient metric tensor
-  for (int cell = 0; cell < num_cells; ++cell) {
-    for (size_type ip = 0; ip < contravarient.extent(1); ++ip) {
-
+  {
+    auto covarient_k = covarient.get_view();
+    auto contravarient_k = contravarient.get_view();
+    auto jac_k = jac.get_view();
+    Kokkos::MDRangePolicy<PHX::Device,Kokkos::Rank<2>> policy({0,0},{num_cells,static_cast<int>(contravarient.extent(1))});
+    Kokkos::parallel_for("evalaute covarient metric tensor",policy,KOKKOS_LAMBDA (const int cell,const int ip) {
       // zero out matrix
-      for (size_type i = 0; i < contravarient.extent(2); ++i)
-        for (size_type j = 0; j < contravarient.extent(3); ++j)
-          covarient(cell,ip,i,j) = 0.0;
+      for (int i = 0; i < static_cast<int>(contravarient_k.extent(2)); ++i)
+        for (int j = 0; j < static_cast<int>(contravarient_k.extent(3)); ++j)
+          covarient_k(cell,ip,i,j) = 0.0;
 
       // g^{ij} = \frac{\parital x_i}{\partial \chi_\alpha}\frac{\parital x_j}{\partial \chi_\alpha}
-      for (size_type i = 0; i < contravarient.extent(2); ++i) {
-        for (size_type j = 0; j < contravarient.extent(3); ++j) {
-          for (size_type alpha = 0; alpha < contravarient.extent(2); ++alpha) {
-            covarient(cell,ip,i,j) += jac(cell,ip,i,alpha) * jac(cell,ip,j,alpha);
+      for (int i = 0; i < static_cast<int>(contravarient_k.extent(2)); ++i) {
+        for (int j = 0; j < static_cast<int>(contravarient_k.extent(3)); ++j) {
+          for (int alpha = 0; alpha < static_cast<int>(contravarient_k.extent(2)); ++alpha) {
+            covarient_k(cell,ip,i,j) += jac_k(cell,ip,i,alpha) * jac_k(cell,ip,j,alpha);
           }
         }
       }
-
-    }
+    });
+    PHX::Device::execution_space().fence();
   }
 
   auto s_covarient = Kokkos::subview(covarient.get_view(),std::make_pair(0,num_cells),Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL());
@@ -986,16 +979,20 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
   Intrepid2::RealSpaceTools<PHX::Device::execution_space>::inverse(s_contravarient, s_covarient);
 
   // norm of g_ij
-  for (int cell = 0; cell < num_cells; ++cell) {
-    for (size_type ip = 0; ip < contravarient.extent(1); ++ip) {
-      norm_contravarient(cell,ip) = 0.0;
-      for (size_type i = 0; i < contravarient.extent(2); ++i) {
-        for (size_type j = 0; j < contravarient.extent(3); ++j) {
-          norm_contravarient(cell,ip) += contravarient(cell,ip,i,j) * contravarient(cell,ip,i,j);
+  {
+    auto contravarient_k = contravarient.get_view();
+    auto norm_contravarient_k = norm_contravarient.get_view();
+    Kokkos::MDRangePolicy<PHX::Device,Kokkos::Rank<2>> policy({0,0},{num_cells,static_cast<int>(contravarient.extent(1))});
+    Kokkos::parallel_for("evaluate norm_contravarient",policy,KOKKOS_LAMBDA (const int cell,const int ip) {
+      norm_contravarient_k(cell,ip) = 0.0;
+      for (int i = 0; i < static_cast<int>(contravarient_k.extent(2)); ++i) {
+        for (int j = 0; j < static_cast<int>(contravarient_k.extent(3)); ++j) {
+          norm_contravarient_k(cell,ip) += contravarient_k(cell,ip,i,j) * contravarient_k(cell,ip,i,j);
         }
       }
-      norm_contravarient(cell,ip) = std::sqrt(norm_contravarient(cell,ip));
-    }
+      norm_contravarient_k(cell,ip) = std::sqrt(norm_contravarient_k(cell,ip));
+    });
+    PHX::Device::execution_space().fence();
   }
 }
 
@@ -1172,17 +1169,16 @@ getCubatureCV(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordinates,
   {
     size_type num_nodes = in_node_coordinates.extent(1);
     size_type num_dims = in_node_coordinates.extent(2);
-
-    for (size_type cell = 0; cell < num_cells;  ++cell) {
-      for (size_type node = 0; node < num_nodes; ++node) {
-        for (size_type dim = 0; dim < num_dims; ++dim) {
-          node_coordinates(cell,node,dim) =
-              in_node_coordinates(cell,node,dim);
-          dyn_node_coordinates(cell,node,dim) =
-              Sacado::ScalarValue<Scalar>::eval(in_node_coordinates(cell,node,dim));
-        }
-      }
-    }
+    auto node_coordinates_k = node_coordinates.get_view();
+    auto dyn_node_coordinates_k = dyn_node_coordinates.get_view();
+    auto in_node_coordinates_k = in_node_coordinates.get_view();
+
+    Kokkos::MDRangePolicy<PHX::Device,Kokkos::Rank<3>> policy({0,0,0},{num_cells,num_nodes,num_dims});
+    Kokkos::parallel_for("getCubatureCV: node coordinates",policy,KOKKOS_LAMBDA (const int cell,const int node,const int dim) {
+      node_coordinates_k(cell,node,dim) = in_node_coordinates_k(cell,node,dim);
+      dyn_node_coordinates_k(cell,node,dim) = Sacado::ScalarValue<Scalar>::eval(in_node_coordinates_k(cell,node,dim));
+    });
+    PHX::Device::execution_space().fence();
   }
 
   auto s_dyn_phys_cub_points = Kokkos::subdynrankview(dyn_phys_cub_points.get_view(),cell_range,Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL());
@@ -1199,19 +1195,27 @@ getCubatureCV(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordinates,
   // size_type num_cells = dyn_phys_cub_points.extent(0);
   size_type num_ip =dyn_phys_cub_points.extent(1);
   size_type num_dims = dyn_phys_cub_points.extent(2);
-
-  for (size_type cell = 0; cell < num_cells;  ++cell) {
-    for (size_type ip = 0; ip < num_ip;  ++ip) {
-      if (int_rule->cv_type != "side")
-        weighted_measure(cell,ip) = dyn_phys_cub_weights(cell,ip);
-      for (size_type dim = 0; dim < num_dims; ++dim) {
-        ip_coordinates(cell,ip,dim) = dyn_phys_cub_points(cell,ip,dim);
-        if (int_rule->cv_type == "side")
-          weighted_normals(cell,ip,dim) = dyn_phys_cub_norms(cell,ip,dim);
-      }
+  auto weighted_measure_k = weighted_measure.get_view();
+  auto dyn_phys_cub_weights_k = dyn_phys_cub_weights.get_view();
+  auto ip_coordinates_k = ip_coordinates.get_view();
+  auto dyn_phys_cub_points_k = dyn_phys_cub_points.get_view();
+  auto weighted_normals_k = weighted_normals.get_view();
+  auto dyn_phys_cub_norms_k = dyn_phys_cub_norms.get_view();
+  bool is_side = false;
+  if (int_rule->cv_type == "side")
+    is_side = true;
+
+  Kokkos::MDRangePolicy<PHX::Device,Kokkos::Rank<2>> policy({0,0},{num_cells,num_ip});
+  Kokkos::parallel_for("getCubatureCV: weighted measure",policy,KOKKOS_LAMBDA (const int cell,const int ip) {
+    if (!is_side)
+      weighted_measure_k(cell,ip) = dyn_phys_cub_weights_k(cell,ip);
+    for (size_type dim = 0; dim < num_dims; ++dim) {
+      ip_coordinates_k(cell,ip,dim) = dyn_phys_cub_points_k(cell,ip,dim);
+      if (is_side)
+        weighted_normals_k(cell,ip,dim) = dyn_phys_cub_norms_k(cell,ip,dim);
     }
-  }
-
+  });
+  PHX::Device::execution_space().fence();
 }
 
 template <typename Scalar>
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
index bc088b1d403f..5b0d10854f26 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
@@ -159,15 +159,17 @@ namespace panzer {
     void swapQuadraturePoints(int cell,int a,int b);
 
     static void convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3]);
-    
-  protected:
 
+    /// This should be a private method, but using lambdas on cuda forces this to be public.
+    void evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
+    /// This should be a private method, but using lambdas on cuda forces this to be public.
+    void getCubatureCV(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
 
+  protected:
 
     // TODO: Make this a utility function that only exists in source file
     Teuchos::RCP<Intrepid2::Cubature<PHX::Device::execution_space,double,double>> getIntrepidCubature(const panzer::IntegrationRule & ir) const;
 
-
   private:
     bool alloc_arrays;
     std::string prefix;
@@ -175,8 +177,6 @@ namespace panzer {
 
     void generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells,const SubcellConnectivity & face_connectivity);
     void getCubature(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
-    void getCubatureCV(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
-    void evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
     void evaluateValuesCV(const PHX::MDField<Scalar,Cell,NODE,Dim> & vertex_coordinates,const int in_num_cells);
   };
 
diff --git a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
index 33e520f295f1..342f86da798c 100644
--- a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
+++ b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
@@ -93,27 +93,30 @@ namespace panzer {
     typedef panzer::ArrayTraits<double,PHX::MDField<double> >::size_type size_type;
     const size_type x = 0;
     const size_type y = 1;
-    for (size_type cell = 0; cell < node_coordinates.extent(0); ++cell) {
-      node_coordinates(cell,0,x) = 0.0;
-      node_coordinates(cell,0,y) = 0.0;
-      node_coordinates(cell,1,x) = 1.0;
-      node_coordinates(cell,1,y) = 0.0;
-      node_coordinates(cell,2,x) = 1.0;
-      node_coordinates(cell,2,y) = 1.0;
-      node_coordinates(cell,3,x) = 0.0;
-      node_coordinates(cell,3,y) = 1.0;
-    }
+    auto node_coordinates_k = node_coordinates.get_view();
+    Kokkos::parallel_for("initialize node coords",node_coordinates.extent(0),
+                         KOKKOS_LAMBDA (const int cell) {
+      node_coordinates_k(cell,0,x) = 0.0;
+      node_coordinates_k(cell,0,y) = 0.0;
+      node_coordinates_k(cell,1,x) = 1.0;
+      node_coordinates_k(cell,1,y) = 0.0;
+      node_coordinates_k(cell,2,x) = 1.0;
+      node_coordinates_k(cell,2,y) = 1.0;
+      node_coordinates_k(cell,3,x) = 0.0;
+      node_coordinates_k(cell,3,y) = 1.0;
+    });
 
     int_values.evaluateValues(node_coordinates);
     
     TEST_EQUALITY(int_values.ip_coordinates.extent(1), 4);
     double realspace_x_coord = (1.0/std::sqrt(3.0) + 1.0) / 2.0;
     double realspace_y_coord = (1.0/std::sqrt(3.0) + 1.0) / 2.0;
-    TEST_FLOATING_EQUALITY(int_values.ip_coordinates(0,0,0), 
+    auto tmp_coords = int_values.ip_coordinates;
+    auto tmp_coords_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),tmp_coords.get_view());
+    TEST_FLOATING_EQUALITY(tmp_coords_host(0,0,0), 
                            realspace_x_coord, 1.0e-8);
-    TEST_FLOATING_EQUALITY(int_values.ip_coordinates(0,0,1), 
+    TEST_FLOATING_EQUALITY(tmp_coords_host(0,0,1), 
                            realspace_y_coord, 1.0e-8);
-
   }
 
   TEUCHOS_UNIT_TEST(integration_values, control_volume)
@@ -158,16 +161,18 @@ namespace panzer {
     typedef panzer::ArrayTraits<double,PHX::MDField<double> >::size_type size_type;
     const size_type x = 0;
     const size_type y = 1;
-    for (size_type cell = 0; cell < node_coordinates.extent(0); ++cell) {
-      node_coordinates(cell,0,x) = 0.0;
-      node_coordinates(cell,0,y) = 0.0;
-      node_coordinates(cell,1,x) = 1.0;
-      node_coordinates(cell,1,y) = 0.0;
-      node_coordinates(cell,2,x) = 1.0;
-      node_coordinates(cell,2,y) = 1.0;
-      node_coordinates(cell,3,x) = 0.0;
-      node_coordinates(cell,3,y) = 1.0;
-    }
+    auto node_coordinates_k = node_coordinates.get_view();
+    Kokkos::parallel_for("initialize node coords",node_coordinates.extent(0),
+                         KOKKOS_LAMBDA (const int cell) {
+      node_coordinates_k(cell,0,x) = 0.0;
+      node_coordinates_k(cell,0,y) = 0.0;
+      node_coordinates_k(cell,1,x) = 1.0;
+      node_coordinates_k(cell,1,y) = 0.0;
+      node_coordinates_k(cell,2,x) = 1.0;
+      node_coordinates_k(cell,2,y) = 1.0;
+      node_coordinates_k(cell,3,x) = 0.0;
+      node_coordinates_k(cell,3,y) = 1.0;
+    });
 
     int_values_vol.evaluateValues(node_coordinates);
     int_values_side.evaluateValues(node_coordinates);
@@ -177,15 +182,19 @@ namespace panzer {
     TEST_EQUALITY(int_values_side.weighted_normals.extent(1), 4);
     double realspace_x_coord_1 = 0.25;
     double realspace_y_coord_1 = 0.25;
-    TEST_FLOATING_EQUALITY(int_values_vol.ip_coordinates(0,0,0), 
+    auto tmp_coords = int_values_vol.ip_coordinates;
+    auto tmp_coords_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),tmp_coords.get_view());
+    TEST_FLOATING_EQUALITY(tmp_coords_host(0,0,0), 
                            realspace_x_coord_1, 1.0e-8);
-    TEST_FLOATING_EQUALITY(int_values_vol.ip_coordinates(0,0,1), 
+    TEST_FLOATING_EQUALITY(tmp_coords_host(0,0,1), 
                            realspace_y_coord_1, 1.0e-8);
     double realspace_x_coord_2 = 0.5;
     double realspace_y_coord_2 = 0.25;
-    TEST_FLOATING_EQUALITY(int_values_side.ip_coordinates(0,0,0), 
+    auto tmp_side_coords = int_values_side.ip_coordinates;
+    auto tmp_side_coords_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),tmp_side_coords.get_view());
+    TEST_FLOATING_EQUALITY(tmp_side_coords_host(0,0,0), 
                            realspace_x_coord_2, 1.0e-8);
-    TEST_FLOATING_EQUALITY(int_values_side.ip_coordinates(0,0,1), 
+    TEST_FLOATING_EQUALITY(tmp_side_coords_host(0,0,1), 
                            realspace_y_coord_2, 1.0e-8);
 
   }
@@ -225,25 +234,29 @@ namespace panzer {
     typedef panzer::ArrayTraits<double,PHX::MDField<double> >::size_type size_type;
     const size_type x = 0;
     const size_type y = 1;
-    for (size_type cell = 0; cell < node_coordinates.extent(0); ++cell) {
-      node_coordinates(cell,0,x) = 0.0;
-      node_coordinates(cell,0,y) = 0.0;
-      node_coordinates(cell,1,x) = 1.0;
-      node_coordinates(cell,1,y) = 0.0;
-      node_coordinates(cell,2,x) = 1.0;
-      node_coordinates(cell,2,y) = 1.0;
-      node_coordinates(cell,3,x) = 0.0;
-      node_coordinates(cell,3,y) = 1.0;
-    }
+    auto node_coordinates_k = node_coordinates.get_view();
+    Kokkos::parallel_for("initialize node coords",node_coordinates.extent(0),
+                         KOKKOS_LAMBDA (const int cell) {
+      node_coordinates_k(cell,0,x) = 0.0;
+      node_coordinates_k(cell,0,y) = 0.0;
+      node_coordinates_k(cell,1,x) = 1.0;
+      node_coordinates_k(cell,1,y) = 0.0;
+      node_coordinates_k(cell,2,x) = 1.0;
+      node_coordinates_k(cell,2,y) = 1.0;
+      node_coordinates_k(cell,3,x) = 0.0;
+      node_coordinates_k(cell,3,y) = 1.0;
+    });
 
     int_values_bc.evaluateValues(node_coordinates);
+
+    auto ip_coords_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),int_values_bc.ip_coordinates.get_view());
     
     TEST_EQUALITY(int_values_bc.ip_coordinates.extent(1), 2);
     double realspace_x_coord_1 = 1.0;
     double realspace_y_coord_1 = 0.25;
-    TEST_FLOATING_EQUALITY(int_values_bc.ip_coordinates(0,0,0), 
+    TEST_FLOATING_EQUALITY(ip_coords_host(0,0,0), 
                            realspace_x_coord_1, 1.0e-8);
-    TEST_FLOATING_EQUALITY(int_values_bc.ip_coordinates(0,0,1), 
+    TEST_FLOATING_EQUALITY(ip_coords_host(0,0,1), 
                            realspace_y_coord_1, 1.0e-8);
 
   }
@@ -282,27 +295,29 @@ namespace panzer {
     typedef panzer::ArrayTraits<double,PHX::MDField<double> >::size_type size_type;
     const size_type x = 0;
     const size_type y = 1;
-    for (size_type cell = 0; cell < node_coordinates.extent(0); ++cell) {
-      node_coordinates(cell,0,x) = 0.0;
-      node_coordinates(cell,0,y) = 0.0;
-      node_coordinates(cell,1,x) = 1.0;
-      node_coordinates(cell,1,y) = 0.0;
-      node_coordinates(cell,2,x) = 1.5;
-      node_coordinates(cell,2,y) = 1.5;
-      node_coordinates(cell,3,x) = 0.0;
-      node_coordinates(cell,3,y) = 1.0;
-    }
+    auto node_coordinates_k = node_coordinates.get_view();
+    Kokkos::parallel_for("initialize node coords",node_coordinates.extent(0),
+                         KOKKOS_LAMBDA (const int cell) {
+      node_coordinates_k(cell,0,x) = 0.0;
+      node_coordinates_k(cell,0,y) = 0.0;
+      node_coordinates_k(cell,1,x) = 1.0;
+      node_coordinates_k(cell,1,y) = 0.0;
+      node_coordinates_k(cell,2,x) = 1.5;
+      node_coordinates_k(cell,2,y) = 1.5;
+      node_coordinates_k(cell,3,x) = 0.0;
+      node_coordinates_k(cell,3,y) = 1.0;
+    });
 
     int_values.evaluateValues(node_coordinates);
 
     // pull out arrays to look at
     
-    auto weighted_measure   = int_values.weighted_measure;
-    auto jac_det            = int_values.jac_det;
-    auto ref_ip_coordinates = int_values.ref_ip_coordinates;
-    auto ip_coordinates     = int_values.ip_coordinates;
-    auto jac                = int_values.jac;
-    auto jac_inv            = int_values.jac_inv;
+    auto weighted_measure   = Kokkos::create_mirror_view(int_values.weighted_measure.get_view());
+    auto jac_det            = Kokkos::create_mirror_view(int_values.jac_det.get_view());
+    auto ref_ip_coordinates = Kokkos::create_mirror_view(int_values.ref_ip_coordinates.get_view());
+    auto ip_coordinates     = Kokkos::create_mirror_view(int_values.ip_coordinates.get_view());
+    auto jac                = Kokkos::create_mirror_view(int_values.jac.get_view());
+    auto jac_inv            = Kokkos::create_mirror_view(int_values.jac_inv.get_view());
 
     int num_cells  = ip_coordinates.extent(0); 
     int num_points = ip_coordinates.extent(1); 
@@ -320,7 +335,13 @@ namespace panzer {
       int new_pt = 1;
 
       int_values.swapQuadraturePoints(tst_cell,org_pt,new_pt);
-   
+      Kokkos::deep_copy(weighted_measure,int_values.weighted_measure.get_view());
+      Kokkos::deep_copy(jac_det,int_values.jac_det.get_view());
+      Kokkos::deep_copy(ref_ip_coordinates,int_values.ref_ip_coordinates.get_view());
+      Kokkos::deep_copy(ip_coordinates,int_values.ip_coordinates.get_view());
+      Kokkos::deep_copy(jac,int_values.jac.get_view());
+      Kokkos::deep_copy(jac_inv,int_values.jac_inv.get_view());
+      
       TEST_EQUALITY(    weighted_measure(ref_cell,org_pt),     weighted_measure(tst_cell,new_pt));
       TEST_EQUALITY(             jac_det(ref_cell,org_pt),              jac_det(tst_cell,new_pt));
       TEST_EQUALITY(ref_ip_coordinates(ref_cell,org_pt,0), ref_ip_coordinates(tst_cell,new_pt,0));
@@ -343,6 +364,13 @@ namespace panzer {
       int new_pt = 3;
 
       int_values.swapQuadraturePoints(tst_cell,org_pt,new_pt);
+      Kokkos::deep_copy(weighted_measure,int_values.weighted_measure.get_view());
+      Kokkos::deep_copy(jac_det,int_values.jac_det.get_view());
+      Kokkos::deep_copy(ref_ip_coordinates,int_values.ref_ip_coordinates.get_view());
+      Kokkos::deep_copy(ip_coordinates,int_values.ip_coordinates.get_view());
+      Kokkos::deep_copy(jac,int_values.jac.get_view());
+      Kokkos::deep_copy(jac_inv,int_values.jac_inv.get_view());
+
    
       TEST_EQUALITY(    weighted_measure(ref_cell,org_pt),     weighted_measure(tst_cell,new_pt));
       TEST_EQUALITY(             jac_det(ref_cell,org_pt),              jac_det(tst_cell,new_pt));

From a4c9c227f49060e77d9dc1186fab33ce7b67dd3c Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Thu, 1 Jul 2021 16:17:24 -0600
Subject: [PATCH 04/14] Panzer: cleanup cuda warnings

---
 packages/panzer/disc-fe/src/Panzer_L2Projection.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_L2Projection.cpp b/packages/panzer/disc-fe/src/Panzer_L2Projection.cpp
index fe3718d2097f..a6ade200128a 100644
--- a/packages/panzer/disc-fe/src/Panzer_L2Projection.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_L2Projection.cpp
@@ -115,8 +115,8 @@ namespace panzer {
 
           const auto basisValues = workset.getBasisValues(targetBasisDescriptor_,integrationDescriptor_);
 
-          const auto unweightedBasis = basisValues.basis_scalar;
-          const auto weightedBasis = basisValues.weighted_basis_scalar;
+          const auto unweightedBasis = basisValues.basis_scalar.get_static_view();
+          const auto weightedBasis = basisValues.weighted_basis_scalar.get_static_view();
 
           // Offsets (this assumes UVM, need to fix)
           const std::vector<panzer::LocalOrdinal>& offsets = targetGlobalIndexer_->getGIDFieldOffsets(block,fieldIndex);
@@ -215,8 +215,8 @@ namespace panzer {
 
           const auto basisValues = workset.getBasisValues(targetBasisDescriptor_,integrationDescriptor_);
 
-          const auto unweightedBasis = basisValues.basis_vector;
-          const auto weightedBasis = basisValues.weighted_basis_vector;
+          const auto unweightedBasis = basisValues.basis_vector.get_static_view();
+          const auto weightedBasis = basisValues.weighted_basis_vector.get_static_view();
 
           // Offsets (this assumes UVM, need to fix)
           const std::vector<panzer::LocalOrdinal>& offsets = targetGlobalIndexer_->getGIDFieldOffsets(block,fieldIndex);

From cb4759502b872aee5c498f0473a38a4dd21c5865 Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Fri, 2 Jul 2021 08:43:25 -0600
Subject: [PATCH 05/14] Panzer: integration values swapQP working on device

---
 .../disc-fe/src/Panzer_IntegrationValues2.cpp | 59 +++++++------
 .../disc-fe/src/Panzer_IntegrationValues2.hpp |  6 +-
 .../src/Panzer_IntegrationValues2_impl.hpp    | 84 +++++++++++++++++++
 .../test/core_tests/integration_values2.cpp   | 22 ++++-
 4 files changed, 141 insertions(+), 30 deletions(-)
 create mode 100644 packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index 1e9660a24e31..5f5c06df5b03 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -58,6 +58,9 @@
 #include "Panzer_Traits.hpp"
 #include "Panzer_SubcellConnectivity.hpp"
 
+// For device member functions to avoid CUDA RDC in unit testing
+#include "Panzer_IntegrationValues2_impl.hpp"
+
 // ***********************************************************
 // * Specializations of setupArrays() for different array types
 // ***********************************************************
@@ -372,12 +375,12 @@ convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scal
     binormal[2] = 0.;
   }
 }
-  
+  /*  
 template <typename Scalar>
 void IntegrationValues2<Scalar>::
 swapQuadraturePoints(int cell,
                      int a,
-                     int b)
+                     int b) const
 {
   const int new_cell_point = a;
   const int old_cell_point = b;
@@ -385,60 +388,62 @@ swapQuadraturePoints(int cell,
   const int cell_dim = ref_ip_coordinates.extent(2);
 
 #ifdef PANZER_DEBUG
-  TEUCHOS_ASSERT(cell < ip_coordinates.extent_int(0));
-  TEUCHOS_ASSERT(a < ip_coordinates.extent_int(1));
-  TEUCHOS_ASSERT(b < ip_coordinates.extent_int(1));
-  TEUCHOS_ASSERT(cell >= 0);
-  TEUCHOS_ASSERT(a >= 0);
-  TEUCHOS_ASSERT(b >= 0);
+  KOKKOS_ASSERT(cell < ip_coordinates.extent_int(0));
+  KOKKOS_ASSERT(a < ip_coordinates.extent_int(1));
+  KOKKOS_ASSERT(b < ip_coordinates.extent_int(1));
+  KOKKOS_ASSERT(cell >= 0);
+  KOKKOS_ASSERT(a >= 0);
+  KOKKOS_ASSERT(b >= 0);
 #endif
-  
-  Scalar hold;
 
-  hold = weighted_measure(cell,new_cell_point);
+  // Using scratch_for_compute_side_measure instead of hold. This
+  // works around UVM issues of DFAD temporaries in device code.
+  // Scalar hold;
+
+  scratch_for_compute_side_measure(0) = weighted_measure(cell,new_cell_point);
   weighted_measure(cell,new_cell_point) = weighted_measure(cell,old_cell_point);
-  weighted_measure(cell,old_cell_point) = hold;
+  weighted_measure(cell,old_cell_point) = scratch_for_compute_side_measure(0);
 
-  hold = jac_det(cell,new_cell_point);
+  scratch_for_compute_side_measure(0) = jac_det(cell,new_cell_point);
   jac_det(cell,new_cell_point) = jac_det(cell,old_cell_point);
-  jac_det(cell,old_cell_point) = hold;
+  jac_det(cell,old_cell_point) = scratch_for_compute_side_measure(0);
 
   for(int dim=0;dim<cell_dim;++dim){
 
-    hold = ref_ip_coordinates(cell,new_cell_point,dim);
+    scratch_for_compute_side_measure(0) = ref_ip_coordinates(cell,new_cell_point,dim);
     ref_ip_coordinates(cell,new_cell_point,dim) = ref_ip_coordinates(cell,old_cell_point,dim);
-    ref_ip_coordinates(cell,old_cell_point,dim) = hold;
+    ref_ip_coordinates(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
 
-    hold = ip_coordinates(cell,new_cell_point,dim);
+    scratch_for_compute_side_measure(0) = ip_coordinates(cell,new_cell_point,dim);
     ip_coordinates(cell,new_cell_point,dim) = ip_coordinates(cell,old_cell_point,dim);
-    ip_coordinates(cell,old_cell_point,dim) = hold;
+    ip_coordinates(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
 
-    hold = surface_normals(cell,new_cell_point,dim);
+    scratch_for_compute_side_measure(0) = surface_normals(cell,new_cell_point,dim);
     surface_normals(cell,new_cell_point,dim) = surface_normals(cell,old_cell_point,dim);
-    surface_normals(cell,old_cell_point,dim) = hold;
+    surface_normals(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
 
     for(int dim2=0;dim2<cell_dim;++dim2){
 
-      hold = jac(cell,new_cell_point,dim,dim2);
+      scratch_for_compute_side_measure(0) = jac(cell,new_cell_point,dim,dim2);
       jac(cell,new_cell_point,dim,dim2) = jac(cell,old_cell_point,dim,dim2);
-      jac(cell,old_cell_point,dim,dim2) = hold;
+      jac(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
 
-      hold = jac_inv(cell,new_cell_point,dim,dim2);
+      scratch_for_compute_side_measure(0) = jac_inv(cell,new_cell_point,dim,dim2);
       jac_inv(cell,new_cell_point,dim,dim2) = jac_inv(cell,old_cell_point,dim,dim2);
-      jac_inv(cell,old_cell_point,dim,dim2) = hold;
+      jac_inv(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
     }
   }
 
   // Rotation matrices are always in 3D
   for(int dim=0; dim<3; ++dim){
     for(int dim2=0; dim2<3; ++dim2){
-      hold = surface_rotation_matrices(cell,new_cell_point,dim,dim2);
+      scratch_for_compute_side_measure(0) = surface_rotation_matrices(cell,new_cell_point,dim,dim2);
       surface_rotation_matrices(cell,new_cell_point,dim,dim2) = surface_rotation_matrices(cell,old_cell_point,dim,dim2);
-      surface_rotation_matrices(cell,old_cell_point,dim,dim2) = hold;
+      surface_rotation_matrices(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
     }
   }
 }
-
+  */
 template <typename Scalar>
 void IntegrationValues2<Scalar>::
 generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordinates,
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
index 5b0d10854f26..73f6e8152726 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
@@ -155,13 +155,17 @@ namespace panzer {
      * \param[in] cell   Cell index
      * \param[in] a      Quadrature point a
      * \param[in] b      Quadrature point b
+     *
+     * NOTE: this is a non-const function but we had to make it const to run on device.
      */
-    void swapQuadraturePoints(int cell,int a,int b);
+    KOKKOS_INLINE_FUNCTION
+    void swapQuadraturePoints(int cell,int a,int b) const;
 
     static void convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3]);
 
     /// This should be a private method, but using lambdas on cuda forces this to be public.
     void evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
+
     /// This should be a private method, but using lambdas on cuda forces this to be public.
     void getCubatureCV(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
 
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
new file mode 100644
index 000000000000..3b1becc16291
--- /dev/null
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
@@ -0,0 +1,84 @@
+#ifndef PANZER_INTEGRATION_VALUES_2_IMPL_HPP
+#define PANZER_INTEGRATION_VALUES_2_IMPL_HPP
+
+/* Normally, we would fold this definition into the .cpp file since we
+   use ETI to compile this object. However, for unit testing the stand
+   alone device function swapQuadraturePoints, we need the definition
+   at the calling point to avoid requiring relocatable device code for
+   cuda. So for the IntegrationValues2 object, if there is a
+   standalone device member function, put it in this file (for unit
+   testing).
+ */
+namespace panzer {
+  
+template <typename Scalar>
+void IntegrationValues2<Scalar>::
+swapQuadraturePoints(int cell,
+                     int a,
+                     int b) const
+{
+  const int new_cell_point = a;
+  const int old_cell_point = b;
+
+  const int cell_dim = ref_ip_coordinates.extent(2);
+
+#ifdef PANZER_DEBUG
+  KOKKOS_ASSERT(cell < ip_coordinates.extent_int(0));
+  KOKKOS_ASSERT(a < ip_coordinates.extent_int(1));
+  KOKKOS_ASSERT(b < ip_coordinates.extent_int(1));
+  KOKKOS_ASSERT(cell >= 0);
+  KOKKOS_ASSERT(a >= 0);
+  KOKKOS_ASSERT(b >= 0);
+#endif
+
+  // Using scratch_for_compute_side_measure instead of hold. This
+  // works around UVM issues of DFAD temporaries in device code.
+  // Scalar hold;
+
+  scratch_for_compute_side_measure(0) = weighted_measure(cell,new_cell_point);
+  weighted_measure(cell,new_cell_point) = weighted_measure(cell,old_cell_point);
+  weighted_measure(cell,old_cell_point) = scratch_for_compute_side_measure(0);
+
+  scratch_for_compute_side_measure(0) = jac_det(cell,new_cell_point);
+  jac_det(cell,new_cell_point) = jac_det(cell,old_cell_point);
+  jac_det(cell,old_cell_point) = scratch_for_compute_side_measure(0);
+
+  for(int dim=0;dim<cell_dim;++dim){
+
+    scratch_for_compute_side_measure(0) = ref_ip_coordinates(cell,new_cell_point,dim);
+    ref_ip_coordinates(cell,new_cell_point,dim) = ref_ip_coordinates(cell,old_cell_point,dim);
+    ref_ip_coordinates(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
+
+    scratch_for_compute_side_measure(0) = ip_coordinates(cell,new_cell_point,dim);
+    ip_coordinates(cell,new_cell_point,dim) = ip_coordinates(cell,old_cell_point,dim);
+    ip_coordinates(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
+
+    scratch_for_compute_side_measure(0) = surface_normals(cell,new_cell_point,dim);
+    surface_normals(cell,new_cell_point,dim) = surface_normals(cell,old_cell_point,dim);
+    surface_normals(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
+
+    for(int dim2=0;dim2<cell_dim;++dim2){
+
+      scratch_for_compute_side_measure(0) = jac(cell,new_cell_point,dim,dim2);
+      jac(cell,new_cell_point,dim,dim2) = jac(cell,old_cell_point,dim,dim2);
+      jac(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
+
+      scratch_for_compute_side_measure(0) = jac_inv(cell,new_cell_point,dim,dim2);
+      jac_inv(cell,new_cell_point,dim,dim2) = jac_inv(cell,old_cell_point,dim,dim2);
+      jac_inv(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
+    }
+  }
+
+  // Rotation matrices are always in 3D
+  for(int dim=0; dim<3; ++dim){
+    for(int dim2=0; dim2<3; ++dim2){
+      scratch_for_compute_side_measure(0) = surface_rotation_matrices(cell,new_cell_point,dim,dim2);
+      surface_rotation_matrices(cell,new_cell_point,dim,dim2) = surface_rotation_matrices(cell,old_cell_point,dim,dim2);
+      surface_rotation_matrices(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
+    }
+  }
+}
+
+}
+
+#endif
diff --git a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
index 342f86da798c..642a07591c08 100644
--- a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
+++ b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
@@ -48,6 +48,7 @@
 #include "Panzer_CellData.hpp"
 #include "Panzer_IntegrationRule.hpp"
 #include "Panzer_IntegrationValues2.hpp"
+#include "Panzer_IntegrationValues2_impl.hpp" // for unit testing wihtout CUDA RDC
 #include "Panzer_ArrayTraits.hpp"
 #include "Panzer_CommonArrayFactories.hpp"
 #include "Panzer_SubcellConnectivity.hpp"
@@ -261,6 +262,21 @@ namespace panzer {
 
   }
 
+  // This should be a simple lambda but the RCPs and std::vector in
+  // the int_values triggers warnings for calling host code form a
+  // host/device function. We eliminate the warnings by using a
+  // functor that is memcopied to device.
+  template<typename Scalar>
+  struct DoSwapOnDevice {
+    int cell_,orig_pt_,new_pt_;
+    panzer::IntegrationValues2<Scalar> int_values_;
+    DoSwapOnDevice(int cell,int orig_pt,int new_pt,panzer::IntegrationValues2<Scalar>& int_values)
+      : cell_(cell),orig_pt_(orig_pt),new_pt_(new_pt),int_values_(int_values){}
+    KOKKOS_INLINE_FUNCTION void operator() (const int ) const {
+      int_values_.swapQuadraturePoints(cell_,orig_pt_,new_pt_);
+    }
+  };
+
   TEUCHOS_UNIT_TEST(integration_values, quadpt_swap)
   {    
     Teuchos::RCP<shards::CellTopology> topo = 
@@ -334,7 +350,8 @@ namespace panzer {
       int org_pt = 0;
       int new_pt = 1;
 
-      int_values.swapQuadraturePoints(tst_cell,org_pt,new_pt);
+      Kokkos::parallel_for("swap qp",1,DoSwapOnDevice<double>(tst_cell,org_pt,new_pt,int_values));
+
       Kokkos::deep_copy(weighted_measure,int_values.weighted_measure.get_view());
       Kokkos::deep_copy(jac_det,int_values.jac_det.get_view());
       Kokkos::deep_copy(ref_ip_coordinates,int_values.ref_ip_coordinates.get_view());
@@ -363,7 +380,8 @@ namespace panzer {
       int org_pt = 1;
       int new_pt = 3;
 
-      int_values.swapQuadraturePoints(tst_cell,org_pt,new_pt);
+      Kokkos::parallel_for("swap qp",1,DoSwapOnDevice<double>(tst_cell,org_pt,new_pt,int_values));
+
       Kokkos::deep_copy(weighted_measure,int_values.weighted_measure.get_view());
       Kokkos::deep_copy(jac_det,int_values.jac_det.get_view());
       Kokkos::deep_copy(ref_ip_coordinates,int_values.ref_ip_coordinates.get_view());

From bc55f3079880cb862669c5fe843ec4b714ac6da5 Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Fri, 2 Jul 2021 15:05:11 -0600
Subject: [PATCH 06/14] Panzer: checkpoint progress on uvm removal from int
 values

---
 .../disc-fe/src/Panzer_IntegrationValues2.cpp | 37 +++++----
 .../disc-fe/src/Panzer_IntegrationValues2.hpp |  4 +-
 .../src/Panzer_SubcellConnectivity.cpp        | 44 ++++++++---
 .../test/core_tests/integration_values2.cpp   | 79 +++++++++++--------
 4 files changed, 104 insertions(+), 60 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index 5f5c06df5b03..f174d1de214d 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -464,14 +464,14 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
   {
     const int num_nodes = in_node_coordinates.extent(1);
     const int num_dims = in_node_coordinates.extent(2);
+    auto node_coordinates_k = node_coordinates.get_view();
+    auto in_node_coordinates_k = in_node_coordinates.get_view();
 
-    for(int cell=0; cell<num_cells; ++cell){
-      for(int node=0; node<num_nodes; ++node){
-        for(int dim=0; dim<num_dims; ++dim){
-          node_coordinates(cell,node,dim) = in_node_coordinates(cell,node,dim);
-        }
-      }
-    }
+    Kokkos::MDRangePolicy<PHX::Device::execution_space,Kokkos::Rank<3>> policy({0,0,0},{num_cells,num_nodes,num_dims});
+    Kokkos::parallel_for("node_coordinates",policy,KOKKOS_LAMBDA (const int cell, const int node, const int dim) {
+          node_coordinates_k(cell,node,dim) = in_node_coordinates_k(cell,node,dim);
+    });
+    PHX::Device::execution_space().fence();
   }
 
   // NOTE: We are assuming that each face can have a different number of points.
@@ -497,8 +497,12 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
       if(cell_dim==1){
         tmp_side_cub_weights = Kokkos::DynRankView<double,PHX::Device>("tmp_side_cub_weights",num_points_on_face);
         tmp_side_cub_points = Kokkos::DynRankView<double,PHX::Device>("cell_tmp_side_cub_points",num_points_on_face,cell_dim);
-        tmp_side_cub_weights(0)=1.;
-        tmp_side_cub_points(0,0) = (subcell_index==0)? -1. : 1.;
+        auto tmp_side_cub_weights_host = Kokkos::create_mirror_view(tmp_side_cub_weights);
+        auto tmp_side_cub_points_host = Kokkos::create_mirror_view(tmp_side_cub_points);
+        tmp_side_cub_weights_host(0)=1.;
+        tmp_side_cub_points_host(0,0) = (subcell_index==0)? -1. : 1.;
+        Kokkos::deep_copy(tmp_side_cub_weights,tmp_side_cub_weights_host);
+        Kokkos::deep_copy(tmp_side_cub_points,tmp_side_cub_points_host);
       } else {
 
         // Get the face topology from the cell topology
@@ -523,12 +527,17 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
                                        cell_topology);
       }
 
-
-      for(int local_point=0;local_point<num_points_on_face;++local_point){
-        const int point = point_offset + local_point;
-        for(int dim=0;dim<cell_dim;++dim){
-          cub_points(point,dim) = tmp_side_cub_points(local_point,dim);
+      // Do this on host
+      {
+        auto tmp_side_cub_points_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),tmp_side_cub_points);
+        auto cub_points_host = Kokkos::create_mirror_view(cub_points.get_static_view());
+        for(int local_point=0;local_point<num_points_on_face;++local_point){
+          const int point = point_offset + local_point;
+          for(int dim=0;dim<cell_dim;++dim){
+            cub_points_host(point,dim) = tmp_side_cub_points_host(local_point,dim);
+          }
         }
+        Kokkos::deep_copy(cub_points.get_static_view(),cub_points_host);
       }
 
 
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
index 73f6e8152726..882095913c4c 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
@@ -169,6 +169,9 @@ namespace panzer {
     /// This should be a private method, but using lambdas on cuda forces this to be public.
     void getCubatureCV(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
 
+    /// This should be a private method, but using lambdas on cuda forces this to be public.
+    void generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells,const SubcellConnectivity & face_connectivity);
+
   protected:
 
     // TODO: Make this a utility function that only exists in source file
@@ -179,7 +182,6 @@ namespace panzer {
     std::string prefix;
     std::vector<PHX::index_size_type> ddims_;
 
-    void generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells,const SubcellConnectivity & face_connectivity);
     void getCubature(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
     void evaluateValuesCV(const PHX::MDField<Scalar,Cell,NODE,Dim> & vertex_coordinates,const int in_num_cells);
   };
diff --git a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
index ae83948aa57e..7dcafa39ef84 100644
--- a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
@@ -64,21 +64,39 @@ setup(const panzer::LocalMeshPartition & partition)
   _cell_to_subcells_adj = PHX::View<int*>("cell_to_subcells_adj", num_cells+1);
   _cell_to_subcells = PHX::View<int*>("cell_to_subcells", num_cells*num_faces_per_cell);
 
-  _subcell_to_cells_adj(0)=0;
-  for(int face=0;face<num_faces;++face){
-    _subcell_to_cells_adj(face+1) =_subcell_to_cells_adj(face)+num_cells_per_face;
-    _subcell_to_cells(num_cells_per_face*face + 0) = partition.face_to_cells(face,0);
-    _subcell_to_cells(num_cells_per_face*face + 1) = partition.face_to_cells(face,1);
-    _subcell_to_local_subcells(num_cells_per_face*face + 0) = partition.face_to_lidx(face,0);
-    _subcell_to_local_subcells(num_cells_per_face*face + 1) = partition.face_to_lidx(face,1);
+  // This line not needed since kokkos initializes the arrays above to zero
+  //_subcell_to_cells_adj(0)=0;
+
+  {
+    auto face_to_cells = partition.face_to_cells;
+    auto face_to_lidx = partition.face_to_lidx;
+    auto subcell_to_cells_adj = _subcell_to_cells_adj;
+    auto subcell_to_cells = _subcell_to_cells;
+    auto subcell_to_local_subcells = _subcell_to_local_subcells;
+    Kokkos::parallel_for("subcell connectivity 0",num_faces,KOKKOS_LAMBDA (const int face) {
+      subcell_to_cells_adj(face+1) = (face * num_cells_per_face) + num_cells_per_face;
+      subcell_to_cells(num_cells_per_face*face + 0) = face_to_cells(face,0);
+      subcell_to_cells(num_cells_per_face*face + 1) = face_to_cells(face,1);
+      subcell_to_local_subcells(num_cells_per_face*face + 0) = face_to_lidx(face,0);
+      subcell_to_local_subcells(num_cells_per_face*face + 1) = face_to_lidx(face,1);
+    });
+    PHX::Device::execution_space().fence();
   }
 
-  _cell_to_subcells_adj(0)=0;
-  for(int cell=0;cell<num_cells;++cell){
-    _cell_to_subcells_adj(cell+1) =_cell_to_subcells_adj(cell)+num_faces_per_cell;
-    for(int local_face=0;local_face<num_faces_per_cell;++local_face){
-      _cell_to_subcells(num_faces_per_cell*cell+local_face) = partition.cell_to_faces(cell,local_face);
-    }
+  // This line not needed since kokkos initializes the arrays above to zero
+  //_cell_to_subcells_adj(0)=0;
+
+  {
+    auto cell_to_faces = partition.cell_to_faces;
+    auto cell_to_subcells_adj = _cell_to_subcells_adj;
+    auto cell_to_subcells = _cell_to_subcells;
+    Kokkos::parallel_for("subcell connectivity 1",num_cells,KOKKOS_LAMBDA (const int cell) {
+      cell_to_subcells_adj(cell+1) = (cell * num_faces_per_cell) + num_faces_per_cell;
+      for(int local_face=0;local_face<num_faces_per_cell;++local_face){
+        cell_to_subcells(num_faces_per_cell*cell+local_face) = cell_to_faces(cell,local_face);
+      }
+    });
+    PHX::Device::execution_space().fence();
   }
 
 }
diff --git a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
index 642a07591c08..46b0ab93c325 100644
--- a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
+++ b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
@@ -441,19 +441,24 @@ namespace panzer {
       mesh.num_virtual_cells = 0;
       mesh.cell_topology = Teuchos::rcp(new shards::CellTopology(shards::getCellTopologyData< shards::Quadrilateral<4> >()));
       mesh.local_cells = PHX::View<panzer::LocalOrdinal*>("local_cells",2);
-      mesh.local_cells(0) = 0;
-      mesh.local_cells(1) = 1;
+      auto local_cells_host = Kokkos::create_mirror_view(mesh.local_cells);
+      local_cells_host(0) = 0;
+      local_cells_host(1) = 1;
+      Kokkos::deep_copy(mesh.local_cells,local_cells_host);
       mesh.cell_vertices = PHX::View<double***>("cell_vertices",2,4,2);
 
       PHX::View<double**> coordinates("coordinates",6,2);
-      coordinates(0,0) = 1.; coordinates(0,1) = 3.;
-      coordinates(1,0) = 3.; coordinates(1,1) = 3.;
-      coordinates(2,0) = 5.; coordinates(2,1) = 3.;
-      coordinates(3,0) = 2.; coordinates(3,1) = 1.;
-      coordinates(4,0) = 3.; coordinates(4,1) = 1.;
-      coordinates(5,0) = 4.; coordinates(5,1) = 1.;
-      
-#define SET_NC(cell,node,vertex) mesh.cell_vertices(cell,node,0) = coordinates(vertex,0); mesh.cell_vertices(cell,node,1) = coordinates(vertex,1);
+      auto coordinates_host = Kokkos::create_mirror_view(coordinates);
+      coordinates_host(0,0) = 1.; coordinates_host(0,1) = 3.;
+      coordinates_host(1,0) = 3.; coordinates_host(1,1) = 3.;
+      coordinates_host(2,0) = 5.; coordinates_host(2,1) = 3.;
+      coordinates_host(3,0) = 2.; coordinates_host(3,1) = 1.;
+      coordinates_host(4,0) = 3.; coordinates_host(4,1) = 1.;
+      coordinates_host(5,0) = 4.; coordinates_host(5,1) = 1.;
+      Kokkos::deep_copy(coordinates,coordinates_host);
+
+      auto cell_vertices_host = Kokkos::create_mirror_view(mesh.cell_vertices);
+#define SET_NC(cell,node,vertex) cell_vertices_host(cell,node,0) = coordinates_host(vertex,0); cell_vertices_host(cell,node,1) = coordinates_host(vertex,1);
       SET_NC(0,0, 0);
       SET_NC(0,1, 3);
       SET_NC(0,2, 4);
@@ -463,27 +468,33 @@ namespace panzer {
       SET_NC(1,2, 5);
       SET_NC(1,3, 2);
 #undef SET_NC
+      Kokkos::deep_copy(mesh.cell_vertices,cell_vertices_host);
 
       mesh.face_to_cells = PHX::View<panzer::LocalOrdinal*[2]>("face_to_cells",6);
-      mesh.face_to_cells(0,0) = 0; mesh.face_to_cells(0,1) =  1;
-      mesh.face_to_cells(1,0) = 0; mesh.face_to_cells(1,1) = -1;
-      mesh.face_to_cells(2,0) = 0; mesh.face_to_cells(2,1) =  1;
-      mesh.face_to_cells(3,0) = 0; mesh.face_to_cells(3,1) = -1;
-      mesh.face_to_cells(4,0) = 1; mesh.face_to_cells(4,1) = -1;
-      mesh.face_to_cells(5,0) = 1; mesh.face_to_cells(5,1) = -1;
+      auto face_to_cells_host = Kokkos::create_mirror_view(mesh.face_to_cells);
+      face_to_cells_host(0,0) = 0; face_to_cells_host(0,1) =  1;
+      face_to_cells_host(1,0) = 0; face_to_cells_host(1,1) = -1;
+      face_to_cells_host(2,0) = 0; face_to_cells_host(2,1) =  1;
+      face_to_cells_host(3,0) = 0; face_to_cells_host(3,1) = -1;
+      face_to_cells_host(4,0) = 1; face_to_cells_host(4,1) = -1;
+      face_to_cells_host(5,0) = 1; face_to_cells_host(5,1) = -1;
+      Kokkos::deep_copy(mesh.face_to_cells,face_to_cells_host);
 
       mesh.face_to_lidx = PHX::View<panzer::LocalOrdinal*[2]>("face_to_lidx",6);
-      mesh.face_to_lidx(0,0) = 0; mesh.face_to_lidx(0,1) =  2;
-      mesh.face_to_lidx(1,0) = 1; mesh.face_to_lidx(1,1) = -1;
-      mesh.face_to_lidx(2,0) = 2; mesh.face_to_lidx(2,1) =  0;
-      mesh.face_to_lidx(3,0) = 3; mesh.face_to_lidx(3,1) = -1;
-      mesh.face_to_lidx(4,0) = 1; mesh.face_to_lidx(4,1) = -1;
-      mesh.face_to_lidx(5,0) = 3; mesh.face_to_lidx(5,1) = -1;
+      auto face_to_lidx_host = Kokkos::create_mirror_view(mesh.face_to_lidx);
+      face_to_lidx_host(0,0) = 0; face_to_lidx_host(0,1) =  2;
+      face_to_lidx_host(1,0) = 1; face_to_lidx_host(1,1) = -1;
+      face_to_lidx_host(2,0) = 2; face_to_lidx_host(2,1) =  0;
+      face_to_lidx_host(3,0) = 3; face_to_lidx_host(3,1) = -1;
+      face_to_lidx_host(4,0) = 1; face_to_lidx_host(4,1) = -1;
+      face_to_lidx_host(5,0) = 3; face_to_lidx_host(5,1) = -1;
+      Kokkos::deep_copy(mesh.face_to_lidx,face_to_lidx_host);
 
       mesh.cell_to_faces = PHX::View<panzer::LocalOrdinal**>("cell_to_faces",2,4);
-      mesh.cell_to_faces(0,0) = 0; mesh.cell_to_faces(0,1) = 1; mesh.cell_to_faces(0,2) = 2; mesh.cell_to_faces(0,3) = 3;
-      mesh.cell_to_faces(1,0) = 2; mesh.cell_to_faces(1,1) = 4; mesh.cell_to_faces(1,2) = 0; mesh.cell_to_faces(1,3) = 5;
-      
+      auto cell_to_faces_host = Kokkos::create_mirror_view(mesh.cell_to_faces);
+      cell_to_faces_host(0,0) = 0; cell_to_faces_host(0,1) = 1; cell_to_faces_host(0,2) = 2; cell_to_faces_host(0,3) = 3;
+      cell_to_faces_host(1,0) = 2; cell_to_faces_host(1,1) = 4; cell_to_faces_host(1,2) = 0; cell_to_faces_host(1,3) = 5;
+      Kokkos::deep_copy(mesh.cell_to_faces,cell_to_faces_host);
     }
    
     const auto id = panzer::IntegrationDescriptor(2, panzer::IntegrationDescriptor::SURFACE);
@@ -494,11 +505,15 @@ namespace panzer {
     // Fill node coordinates
     panzer::MDFieldArrayFactory af("prefix_",true);
     auto node_coordinates = af.buildStaticArray<double,Cell,NODE,Dim>("node_coordinates",2,4,2);
-
-    for(int i=0; i<mesh.cell_vertices.extent_int(0); ++i)
-      for(int j=0; j<mesh.cell_vertices.extent_int(1); ++j)
-        for(int k=0; k<mesh.cell_vertices.extent_int(2); ++k)
-          node_coordinates(i,j,k) = mesh.cell_vertices(i,j,k);
+    {
+      auto node_coordinates_host = Kokkos::create_mirror_view(node_coordinates.get_static_view());
+      auto cell_vertices_host = Kokkos::create_mirror_view(mesh.cell_vertices);
+      for(int i=0; i<mesh.cell_vertices.extent_int(0); ++i)
+        for(int j=0; j<mesh.cell_vertices.extent_int(1); ++j)
+          for(int k=0; k<mesh.cell_vertices.extent_int(2); ++k)
+            node_coordinates_host(i,j,k) = cell_vertices_host(i,j,k);
+      Kokkos::deep_copy(node_coordinates.get_static_view(),node_coordinates_host);
+    }
 
     // Make sure the integration values fail to construct without the subcell connectivity
     TEST_THROW(int_values.evaluateValues(node_coordinates), std::logic_error);
@@ -511,8 +526,8 @@ namespace panzer {
     int_values.evaluateValues(node_coordinates,-1,connectivity);
     
     // This test will focus on normals and points
-    const auto normals = int_values.surface_normals;
-    const auto points = int_values.ip_coordinates;
+    const auto normals = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),int_values.surface_normals.get_static_view());
+    const auto points = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),int_values.ip_coordinates.get_static_view());
     const int num_points_per_cell = points.extent_int(1);
     const int num_points_per_face = num_points_per_cell / 4;
 

From 0056ab0796645e0a97b2ed19241f397dae6c3932 Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Tue, 6 Jul 2021 15:17:07 -0600
Subject: [PATCH 07/14] Panzer: remove uvm from surface rotation matrices

---
 .../disc-fe/src/Panzer_IntegrationValues2.cpp | 122 ++++++++++--------
 .../disc-fe/src/Panzer_IntegrationValues2.hpp |   2 +
 .../src/Panzer_IntegrationValues2_impl.hpp    |  85 +++++++++---
 3 files changed, 138 insertions(+), 71 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index f174d1de214d..df0ef75090bc 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -324,6 +324,7 @@ getCubature(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordinates,
                                 *(int_rule->topology));
 }
 
+  /*
 template <typename Scalar>
 void IntegrationValues2<Scalar>::
 convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3])
@@ -351,7 +352,7 @@ convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scal
     }
 
     const T t = sqrt(transverse[0]*transverse[0]+transverse[1]*transverse[1]+transverse[2]*transverse[2]);
-    TEUCHOS_ASSERT(t != 0.);
+    KOKKOS_ASSERT(t != 0.);
     for(int dim=0;dim<3;++dim){
       transverse[dim] /= t;
     }
@@ -375,6 +376,7 @@ convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scal
     binormal[2] = 0.;
   }
 }
+  */
   /*  
 template <typename Scalar>
 void IntegrationValues2<Scalar>::
@@ -556,12 +558,16 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
                              s_node_coordinates,
                              cell_topology);
 
+      PHX::Device::execution_space().fence();
+
       auto side_inverse_jacobian = Kokkos::DynRankView<Scalar,PHX::Device>("side_inv_jac",num_cells,num_points_on_face,cell_dim,cell_dim);
       cell_tools.setJacobianInv(side_inverse_jacobian, side_jacobian);
 
       auto side_det_jacobian = Kokkos::DynRankView<Scalar,PHX::Device>("side_det_jac",num_cells,num_points_on_face);
       cell_tools.setJacobianDet(side_det_jacobian, side_jacobian);
 
+      PHX::Device::execution_space().fence();
+
       // Calculate measures (quadrature weights in physical space) for this side
       auto side_weighted_measure = Kokkos::DynRankView<Scalar,PHX::Device>("side_weighted_measure",num_cells,num_points_on_face);
       if(cell_dim == 1){
@@ -571,65 +577,79 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
           computeEdgeMeasure(side_weighted_measure, side_jacobian, tmp_side_cub_weights,
                              subcell_index,cell_topology,
                              scratch_for_compute_side_measure.get_view());
+        PHX::Device::execution_space().fence();
       } else if(cell_dim == 3){
         Intrepid2::FunctionSpaceTools<PHX::Device::execution_space>::
           computeFaceMeasure(side_weighted_measure, side_jacobian, tmp_side_cub_weights,
                              subcell_index,cell_topology,
                              scratch_for_compute_side_measure.get_view());
+        PHX::Device::execution_space().fence();
       }
 
       // Calculate normals
       auto side_normals = Kokkos::DynRankView<Scalar,PHX::Device>("side_normals",num_cells,num_points_on_face,cell_dim);
       if(cell_dim == 1){
 
-        int other_subcell_index = (subcell_index==0) ? 1 : 0;
+        const int other_subcell_index = (subcell_index==0) ? 1 : 0;
+        auto in_node_coordinates_k = in_node_coordinates.get_view();
+        const auto min = std::numeric_limits<typename Sacado::ScalarType<Scalar>::type>::min();
 
-        for(int cell=0;cell<num_cells;++cell){
-          Scalar norm = (in_node_coordinates(cell,subcell_index,0) - in_node_coordinates(cell,other_subcell_index,0));
-          side_normals(cell,0,0) = norm / fabs(norm+std::numeric_limits<typename Sacado::ScalarType<Scalar>::type>::min());
-        }
+        Kokkos::parallel_for("compute normals 1D",num_cells,KOKKOS_LAMBDA (const int cell) {
+          Scalar norm = (in_node_coordinates_k(cell,subcell_index,0) - in_node_coordinates_k(cell,other_subcell_index,0));
+          side_normals(cell,0,0) = norm / fabs(norm + min);
+        });
 
       } else {
 
         cell_tools.getPhysicalSideNormals(side_normals,side_jacobian,subcell_index,cell_topology);
 
         // Normalize each normal
-        for(int cell=0;cell<num_cells;++cell){
-          for(int point=0;point<num_points_on_face;++point){
-            Scalar n = 0.;
+        Kokkos::MDRangePolicy<PHX::Device::execution_space,Kokkos::Rank<2>> policy({0,0},{num_cells,num_points_on_face});
+        Kokkos::parallel_for("Normalize the normals",policy,KOKKOS_LAMBDA (const int cell,const int point) {
+          Scalar n = 0.;
+          for(int dim=0;dim<cell_dim;++dim){
+            n += side_normals(cell,point,dim)*side_normals(cell,point,dim);
+          }
+          // If n is zero then this is - hopefully - a virtual cell
+          if(n > 0.){
+            n = std::sqrt(n);
             for(int dim=0;dim<cell_dim;++dim){
-              n += side_normals(cell,point,dim)*side_normals(cell,point,dim);
-            }
-            // If n is zero then this is - hopefully - a virtual cell
-            if(n > 0.){
-              n = std::sqrt(n);
-              for(int dim=0;dim<cell_dim;++dim){
-                side_normals(cell,point,dim) /= n;
-              }
+              side_normals(cell,point,dim) /= n;
             }
           }
-        }
+        });
 
       }
+      PHX::Device::execution_space().fence();
 
       // Now that we have all these wonderful values, lets copy them to the actual arrays
-      for(int cell=0;cell<num_cells;++cell){
-        for(int side_point=0; side_point<num_points_on_face;++side_point){
+      {
+        auto weighted_measure_k = weighted_measure.get_view();
+        auto jac_k = jac.get_view();
+        auto jac_inv_k = jac_inv.get_view();
+        auto jac_det_k = jac_det.get_view();
+        auto ref_ip_coordinates_k = ref_ip_coordinates.get_view();
+        auto ip_coordinates_k = ip_coordinates.get_view();
+        auto surface_normals_k = surface_normals.get_view();
+        auto cub_points_k = cub_points.get_view();
+        Kokkos::MDRangePolicy<PHX::Device::execution_space,Kokkos::Rank<2>> policy({0,0},{num_cells,num_points_on_face});
+        Kokkos::parallel_for("copy values",policy,KOKKOS_LAMBDA (const int cell,const int side_point) {
           const int cell_point = point_offset + side_point;
 
-          weighted_measure(cell,cell_point) = side_weighted_measure(cell,side_point);
-          jac_det(cell,cell_point) = side_det_jacobian(cell,side_point);
+          weighted_measure_k(cell,cell_point) = side_weighted_measure(cell,side_point);
+          jac_det_k(cell,cell_point) = side_det_jacobian(cell,side_point);
           for(int dim=0;dim<cell_dim;++dim){
-            ref_ip_coordinates(cell,cell_point,dim) = cub_points(cell_point,dim);
-            ip_coordinates(cell,cell_point,dim)     = side_ip_coordinates(cell,side_point,dim);
-            surface_normals(cell,cell_point,dim)    = side_normals(cell,side_point,dim);
+            ref_ip_coordinates_k(cell,cell_point,dim) = cub_points_k(cell_point,dim);
+            ip_coordinates_k(cell,cell_point,dim)     = side_ip_coordinates(cell,side_point,dim);
+            surface_normals_k(cell,cell_point,dim)    = side_normals(cell,side_point,dim);
 
             for(int dim2=0;dim2<cell_dim;++dim2){
-              jac(cell,cell_point,dim,dim2) = side_jacobian(cell,side_point,dim,dim2);
-              jac_inv(cell,cell_point,dim,dim2) = side_inverse_jacobian(cell,side_point,dim,dim2);
+              jac_k(cell,cell_point,dim,dim2) = side_jacobian(cell,side_point,dim,dim2);
+              jac_inv_k(cell,cell_point,dim,dim2) = side_inverse_jacobian(cell,side_point,dim,dim2);
             }
           }
-        }
+        });
+        PHX::Device::execution_space().fence();
       }
       point_offset += num_points_on_face;
     }
@@ -638,33 +658,27 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
   // We also need surface rotation matrices in order to enforce alignment
   {
     const int num_points = ir.getPointOffset(num_subcells);
-    Scalar normal[3];
-    Scalar transverse[3];
-    Scalar binormal[3];
-    for(int i=0;i<3;i++){normal[i]=0.;}
-    for(int i=0;i<3;i++){transverse[i]=0.;}
-    for(int i=0;i<3;i++){binormal[i]=0.;}
-    for(int cell=0; cell<num_cells; ++cell){
-      for(int subcell_index=0; subcell_index<num_subcells; ++subcell_index){
-        for(int point=0; point<num_points; ++point){
-          
-          for(int dim=0; dim<3; ++dim)
-            normal[dim] = 0.0;
-          
-          for(int dim=0; dim<cell_dim; ++dim){
-            normal[dim] = surface_normals(cell,point,dim);
-          }
-          
-          convertNormalToRotationMatrix(normal,transverse,binormal);
-          
-          for(int dim=0; dim<3; ++dim){
-            surface_rotation_matrices(cell,point,0,dim) = normal[dim];
-            surface_rotation_matrices(cell,point,1,dim) = transverse[dim];
-            surface_rotation_matrices(cell,point,2,dim) = binormal[dim];
-          }
-        }
+    auto surface_normals_k = surface_normals.get_view();
+    auto surface_rotation_matrices_k = surface_rotation_matrices.get_view();
+    Kokkos::MDRangePolicy<PHX::Device::execution_space,Kokkos::Rank<3>> policy({0,0,0},{num_cells,num_subcells,num_points});
+    Kokkos::parallel_for("create surface rotation matrices",policy,KOKKOS_LAMBDA (const int cell,const int subcell_index,const int point) {
+      Scalar normal[3];
+        
+      for(int dim=0; dim<cell_dim; ++dim){
+        normal[dim] = surface_normals_k(cell,point,dim);
       }
-    }
+      
+      Scalar transverse[3];
+      Scalar binormal[3];
+      convertNormalToRotationMatrix(normal,transverse,binormal);
+      
+      for(int dim=0; dim<3; ++dim){
+        surface_rotation_matrices_k(cell,point,0,dim) = normal[dim];
+        surface_rotation_matrices_k(cell,point,1,dim) = transverse[dim];
+        surface_rotation_matrices_k(cell,point,2,dim) = binormal[dim];
+      }
+    });
+    PHX::Device::execution_space().fence();
   }
 
   // =========================================================
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
index 882095913c4c..178a5d45dd87 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
@@ -161,6 +161,8 @@ namespace panzer {
     KOKKOS_INLINE_FUNCTION
     void swapQuadraturePoints(int cell,int a,int b) const;
 
+    KOKKOS_INLINE_FUNCTION
+    //void convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3]);
     static void convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3]);
 
     /// This should be a private method, but using lambdas on cuda forces this to be public.
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
index 3b1becc16291..8106f1e9b507 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
@@ -31,41 +31,40 @@ swapQuadraturePoints(int cell,
   KOKKOS_ASSERT(b >= 0);
 #endif
 
-  // Using scratch_for_compute_side_measure instead of hold. This
-  // works around UVM issues of DFAD temporaries in device code.
-  // Scalar hold;
+  // If this is a DFAD type, we will need to fix allocation to size the derivative array. 
+  Scalar hold;
 
-  scratch_for_compute_side_measure(0) = weighted_measure(cell,new_cell_point);
+  hold = weighted_measure(cell,new_cell_point);
   weighted_measure(cell,new_cell_point) = weighted_measure(cell,old_cell_point);
-  weighted_measure(cell,old_cell_point) = scratch_for_compute_side_measure(0);
+  weighted_measure(cell,old_cell_point) = hold;
 
-  scratch_for_compute_side_measure(0) = jac_det(cell,new_cell_point);
+  hold = jac_det(cell,new_cell_point);
   jac_det(cell,new_cell_point) = jac_det(cell,old_cell_point);
-  jac_det(cell,old_cell_point) = scratch_for_compute_side_measure(0);
+  jac_det(cell,old_cell_point) = hold;
 
   for(int dim=0;dim<cell_dim;++dim){
 
-    scratch_for_compute_side_measure(0) = ref_ip_coordinates(cell,new_cell_point,dim);
+    hold = ref_ip_coordinates(cell,new_cell_point,dim);
     ref_ip_coordinates(cell,new_cell_point,dim) = ref_ip_coordinates(cell,old_cell_point,dim);
-    ref_ip_coordinates(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
+    ref_ip_coordinates(cell,old_cell_point,dim) = hold;
 
-    scratch_for_compute_side_measure(0) = ip_coordinates(cell,new_cell_point,dim);
+    hold = ip_coordinates(cell,new_cell_point,dim);
     ip_coordinates(cell,new_cell_point,dim) = ip_coordinates(cell,old_cell_point,dim);
-    ip_coordinates(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
+    ip_coordinates(cell,old_cell_point,dim) = hold;
 
-    scratch_for_compute_side_measure(0) = surface_normals(cell,new_cell_point,dim);
+    hold = surface_normals(cell,new_cell_point,dim);
     surface_normals(cell,new_cell_point,dim) = surface_normals(cell,old_cell_point,dim);
-    surface_normals(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
+    surface_normals(cell,old_cell_point,dim) = hold;
 
     for(int dim2=0;dim2<cell_dim;++dim2){
 
-      scratch_for_compute_side_measure(0) = jac(cell,new_cell_point,dim,dim2);
+      hold = jac(cell,new_cell_point,dim,dim2);
       jac(cell,new_cell_point,dim,dim2) = jac(cell,old_cell_point,dim,dim2);
-      jac(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
+      jac(cell,old_cell_point,dim,dim2) = hold;
 
-      scratch_for_compute_side_measure(0) = jac_inv(cell,new_cell_point,dim,dim2);
+      hold = jac_inv(cell,new_cell_point,dim,dim2);
       jac_inv(cell,new_cell_point,dim,dim2) = jac_inv(cell,old_cell_point,dim,dim2);
-      jac_inv(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
+      jac_inv(cell,old_cell_point,dim,dim2) = hold;
     }
   }
 
@@ -79,6 +78,58 @@ swapQuadraturePoints(int cell,
   }
 }
 
+template <typename Scalar>
+void IntegrationValues2<Scalar>::
+convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3])
+{
+  using T = Scalar;
+  
+  const T n  = sqrt(normal[0]*normal[0]+normal[1]*normal[1]+normal[2]*normal[2]);
+
+  // If this fails then the geometry for this cell is probably undefined
+  if(n > 0.){
+    // Make sure transverse is not parallel to normal within some margin of error
+    transverse[0]=0.;transverse[1]=1.;transverse[2]=0.;
+    if(std::fabs(normal[0]*transverse[0]+normal[1]*transverse[1])>0.9){
+      transverse[0]=1.;transverse[1]=0.;
+    }
+
+    const T nt = normal[0]*transverse[0]+normal[1]*transverse[1]+normal[2]*transverse[2];
+
+    // Note normal has unit length
+    const T mult = nt/(n*n); // = nt
+
+    // Remove normal projection from transverse
+    for(int dim=0;dim<3;++dim){
+      transverse[dim] = transverse[dim] - mult * normal[dim];
+    }
+
+    const T t = sqrt(transverse[0]*transverse[0]+transverse[1]*transverse[1]+transverse[2]*transverse[2]);
+    KOKKOS_ASSERT(t != 0.);
+    for(int dim=0;dim<3;++dim){
+      transverse[dim] /= t;
+    }
+
+    // We assume a right handed system such that b = n \times t
+    binormal[0] = (normal[1] * transverse[2] - normal[2] * transverse[1]);
+    binormal[1] = (normal[2] * transverse[0] - normal[0] * transverse[2]);
+    binormal[2] = (normal[0] * transverse[1] - normal[1] * transverse[0]);
+
+    // Normalize binormal
+    const T b = sqrt(binormal[0]*binormal[0]+binormal[1]*binormal[1]+binormal[2]*binormal[2]);
+    for(int dim=0;dim<3;++dim){
+      binormal[dim] /= b;
+    }
+  } else {
+    transverse[0] = 0.;
+    transverse[1] = 0.;
+    transverse[2] = 0.;
+    binormal[0] = 0.;
+    binormal[1] = 0.;
+    binormal[2] = 0.;
+  }
+}
+
 }
 
 #endif

From 33f51d3e88523cc47687764aa45c6a2500696c53 Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Wed, 7 Jul 2021 12:31:09 -0600
Subject: [PATCH 08/14] Panzer: checkpoint subcells work

---
 .../disc-fe/src/Panzer_IntegrationValues2.cpp |   4 +-
 .../src/Panzer_SubcellConnectivity.cpp        |  17 +++
 .../src/Panzer_SubcellConnectivity.hpp        | 105 ++++++++++++++----
 .../panzer/disc-fe/src/Panzer_Workset.cpp     |   1 +
 4 files changed, 105 insertions(+), 22 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index df0ef75090bc..c7004a1e16f2 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -685,7 +685,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
   // Enforce alignment across surface quadrature points
   
   const int num_points = ip_coordinates.extent_int(1);
-  const int num_faces_per_cell = face_connectivity.numSubcellsOnCell(0);
+  const int num_faces_per_cell = face_connectivity.numSubcellsOnCellHost(0);
   const int num_points_per_face = num_points / num_faces_per_cell;
 
   // Now we need to align the cubature points for each face
@@ -703,7 +703,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
     std::vector<int> point_order(num_points_per_face);
 
     // Iterate through faces
-    for(int face=0; face<face_connectivity.numSubcells(); ++face){
+    for(int face=0; face<face_connectivity.numSubcellsHost(); ++face){
       // Cells for sides 0 and 1
       const int cell_0 = face_connectivity.cellForSubcell(face,0);
       const int cell_1 = face_connectivity.cellForSubcell(face,1);
diff --git a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
index 7dcafa39ef84..34fc08e5c38b 100644
--- a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
@@ -56,7 +56,11 @@ setup(const panzer::LocalMeshPartition & partition)
   const int num_cells_per_face = 2;
 
   _num_subcells = num_faces;
+  _num_subcells_device = PHX::View<int*>("_num_subcells_device",1);
+  Kokkos::deep_copy(_num_subcells_device,_num_subcells);
   _num_cells = num_cells;
+  _num_cells_device = PHX::View<int*>("_num_cells_device",1);
+  Kokkos::deep_copy(_num_cells_device,_num_cells);
 
   _subcell_to_cells_adj = PHX::View<int*>("subcell_to_cells_adj", num_faces+1);
   _subcell_to_cells = PHX::View<int*>("subcell_to_cells", num_faces*num_cells_per_face);
@@ -64,6 +68,13 @@ setup(const panzer::LocalMeshPartition & partition)
   _cell_to_subcells_adj = PHX::View<int*>("cell_to_subcells_adj", num_cells+1);
   _cell_to_subcells = PHX::View<int*>("cell_to_subcells", num_cells*num_faces_per_cell);
 
+  // Host copies
+  _subcell_to_cells_adj_host = PHX::View<int*>::HostMirror("subcell_to_cells_adj_host", num_faces+1);
+  _subcell_to_cells_host = PHX::View<int*>::HostMirror("subcell_to_cells_host", num_faces*num_cells_per_face);
+  _subcell_to_local_subcells_host = PHX::View<int*>::HostMirror("subcell_to_local_subcells_host", num_faces*num_cells_per_face);
+  _cell_to_subcells_adj_host = PHX::View<int*>::HostMirror("cell_to_subcells_adj_host", num_cells+1);
+  _cell_to_subcells_host = PHX::View<int*>::HostMirror("cell_to_subcells_host", num_cells*num_faces_per_cell);
+
   // This line not needed since kokkos initializes the arrays above to zero
   //_subcell_to_cells_adj(0)=0;
 
@@ -99,6 +110,12 @@ setup(const panzer::LocalMeshPartition & partition)
     PHX::Device::execution_space().fence();
   }
 
+  // Copy values to host
+  Kokkos::deep_copy(_subcell_to_cells_adj_host, _subcell_to_cells_adj);
+  Kokkos::deep_copy(_subcell_to_cells_host, _subcell_to_cells);
+  Kokkos::deep_copy(_subcell_to_local_subcells_host,_subcell_to_local_subcells);
+  Kokkos::deep_copy(_cell_to_subcells_adj_host,_cell_to_subcells_adj);
+  Kokkos::deep_copy(_cell_to_subcells_host,_cell_to_subcells);
 }
 
 }
diff --git a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.hpp b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.hpp
index 0828a69f0ab0..5a255c3aa3a9 100644
--- a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.hpp
@@ -69,7 +69,8 @@ class SubcellConnectivity
    * \return Number of subcells associated with the cells
    */
   KOKKOS_INLINE_FUNCTION
-  int numSubcells() const {return _num_subcells;}
+  int numSubcells() const {return _num_subcells_device(0);}
+  int numSubcellsHost() const {return _num_subcells;}
 
   /**
    * \brief Gives number of cells in connectivity
@@ -77,7 +78,8 @@ class SubcellConnectivity
    * \return Number of subcells associated with the cells
    */
   KOKKOS_INLINE_FUNCTION
-  int numCells() const {return _num_cells;}
+  int numCells() const {return _num_cells_device(0);}
+  int numCellsHost() const {return _num_cells;}
 
   /**
    * \brief gives number of subcells (e.g. faces) found on a given cell
@@ -90,6 +92,8 @@ class SubcellConnectivity
    */
   KOKKOS_INLINE_FUNCTION
   int numSubcellsOnCell(const int cell) const;
+  inline
+  int numSubcellsOnCellHost(const int cell) const;
 
   /**
    * \brief Returns the number of cells attached to a given subcell
@@ -104,6 +108,8 @@ class SubcellConnectivity
    */
   KOKKOS_INLINE_FUNCTION
   int numCellsOnSubcell(const int subcell) const;
+  inline
+  int numCellsOnSubcellHost(const int subcell) const;
 
   /**
    * \brief Get the subcell index for a given cell and local subcell index
@@ -121,6 +127,8 @@ class SubcellConnectivity
    */
   KOKKOS_INLINE_FUNCTION
   int subcellForCell(const int cell, const int local_subcell_index) const;
+  inline
+  int subcellForCellHost(const int cell, const int local_subcell_index) const;
 
   /**
    * \brief Get the cell for a given subcell and a local_cell_index
@@ -138,6 +146,8 @@ class SubcellConnectivity
    */
   KOKKOS_INLINE_FUNCTION
   int cellForSubcell(const int subcell, const int local_cell_index) const;
+  inline
+  int cellForSubcellHost(const int subcell, const int local_cell_index) const;
 
   /**
    * \brief Get the local subcell index given a subcell and a local cell index
@@ -151,29 +161,38 @@ class SubcellConnectivity
    */
   KOKKOS_INLINE_FUNCTION
   int localSubcellForSubcell(const int subcell, const int local_cell_index) const;
+  inline
+  int localSubcellForSubcellHost(const int subcell, const int local_cell_index) const;
 
 protected:
 
   /// Number of subcells for a given number of cells
   int _num_subcells;
+  PHX::View<int*> _num_subcells_device;
 
   /// Number of cells
   int _num_cells;
+  PHX::View<int*> _num_cells_device;
 
   /// Adjacency array for indexing into subcell_to_cells array
   PHX::View<int*> _subcell_to_cells_adj;
+  PHX::View<int*>::HostMirror _subcell_to_cells_adj_host;
 
   /// Mapping from subcells to cells
   PHX::View<int*> _subcell_to_cells;
+  PHX::View<int*>::HostMirror _subcell_to_cells_host;
 
   /// Mapping from subcell indexes to local subcell indexes
   PHX::View<int*> _subcell_to_local_subcells;
+  PHX::View<int*>::HostMirror _subcell_to_local_subcells_host;
 
   /// Adjacency array for indexing into cell_to_subcells array
   PHX::View<int*> _cell_to_subcells_adj;
+  PHX::View<int*>::HostMirror _cell_to_subcells_adj_host;
 
   /// Mapping from cells to subcells
   PHX::View<int*> _cell_to_subcells;
+  PHX::View<int*>::HostMirror _cell_to_subcells_host;
 
 };
 
@@ -211,34 +230,48 @@ SubcellConnectivity::
 numSubcellsOnCell(const int cell) const
 {
 #ifdef PANZER_DEBUG
-#ifndef KOKKOS_ENABLE_CUDA
-  TEUCHOS_ASSERT(cell >= 0 and cell < _num_cells);
-#endif
+  KOKKOS_ASSERT(cell >= 0 and cell < _num_cells_device(0));
 #endif
   return _cell_to_subcells_adj(cell+1)-_cell_to_subcells_adj(cell);
 }
 
 int
 SubcellConnectivity::
-numCellsOnSubcell(const int subcell) const
+numSubcellsOnCellHost(const int cell) const
 {
 #ifdef PANZER_DEBUG
-#ifndef KOKKOS_ENABLE_CUDA
-  TEUCHOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
+  KOKKOS_ASSERT(cell >= 0 and cell < _num_cells);
 #endif
+  return _cell_to_subcells_adj_host(cell+1)-_cell_to_subcells_adj_host(cell);
+}
+
+int
+SubcellConnectivity::
+numCellsOnSubcell(const int subcell) const
+{
+#ifdef PANZER_DEBUG
+  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells_device(0));
 #endif
   return _subcell_to_cells_adj(subcell+1)-_subcell_to_cells_adj(subcell);
 }
 
 int
 SubcellConnectivity::
-subcellForCell(const int cell, const int local_subcell_index) const
+numCellsOnSubcellHost(const int subcell) const
 {
 #ifdef PANZER_DEBUG
-#ifndef KOKKOS_ENABLE_CUDA
-  TEUCHOS_ASSERT(cell >= 0 and cell < _num_cells);
-  TEUCHOS_ASSERT(local_subcell_index < numSubcellsOnCell(cell));
+  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
 #endif
+  return _subcell_to_cells_adj_host(subcell+1)-_subcell_to_cells_adj_host(subcell);
+}
+
+int
+SubcellConnectivity::
+subcellForCell(const int cell, const int local_subcell_index) const
+{
+#ifdef PANZER_DEBUG
+  KOKKOS_ASSERT(cell >= 0 and cell < _num_cells_device(0));
+  KOKKOS_ASSERT(local_subcell_index < numSubcellsOnCell(cell));
 #endif
   const int index = _cell_to_subcells_adj(cell)+local_subcell_index;
   return _cell_to_subcells(index);
@@ -246,13 +279,23 @@ subcellForCell(const int cell, const int local_subcell_index) const
 
 int
 SubcellConnectivity::
-cellForSubcell(const int subcell, const int local_cell_index) const
+subcellForCellHost(const int cell, const int local_subcell_index) const
 {
 #ifdef PANZER_DEBUG
-#ifndef KOKKOS_ENABLE_CUDA
-  TEUCHOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
-  TEUCHOS_ASSERT(local_cell_index < numCellsOnSubcell(subcell));
+  KOKKOS_ASSERT(cell >= 0 and cell < _num_cells);
+  KOKKOS_ASSERT(local_subcell_index < numSubcellsOnCellHost(cell));
 #endif
+  const int index = _cell_to_subcells_adj_host(cell)+local_subcell_index;
+  return _cell_to_subcells_host(index);
+}
+
+int
+SubcellConnectivity::
+cellForSubcell(const int subcell, const int local_cell_index) const
+{
+#ifdef PANZER_DEBUG
+  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells_device(0));
+  KOKKOS_ASSERT(local_cell_index < numCellsOnSubcell(subcell));
 #endif
   const int index = _subcell_to_cells_adj(subcell)+local_cell_index;
   return _subcell_to_cells(index);
@@ -260,18 +303,40 @@ cellForSubcell(const int subcell, const int local_cell_index) const
 
 int
 SubcellConnectivity::
-localSubcellForSubcell(const int subcell, const int local_cell_index) const
+cellForSubcellHost(const int subcell, const int local_cell_index) const
 {
 #ifdef PANZER_DEBUG
-#ifndef KOKKOS_ENABLE_CUDA
-  TEUCHOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
-  TEUCHOS_ASSERT(local_cell_index < numCellsOnSubcell(subcell));
+  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
+  KOKKOS_ASSERT(local_cell_index < numCellsOnSubcellHost(subcell));
 #endif
+  const int index = _subcell_to_cells_adj_host(subcell)+local_cell_index;
+  return _subcell_to_cells_host(index);
+}
+
+int
+SubcellConnectivity::
+localSubcellForSubcell(const int subcell, const int local_cell_index) const
+{
+#ifdef PANZER_DEBUG
+  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells_device(0));
+  KOKKOS_ASSERT(local_cell_index < numCellsOnSubcell(subcell));
 #endif
   const int index = _subcell_to_cells_adj(subcell)+local_cell_index;
   return _subcell_to_local_subcells(index);
 }
 
+int
+SubcellConnectivity::
+localSubcellForSubcellHost(const int subcell, const int local_cell_index) const
+{
+#ifdef PANZER_DEBUG
+  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
+  KOKKOS_ASSERT(local_cell_index < numCellsOnSubcellHost(subcell));
+#endif
+  const int index = _subcell_to_cells_adj_host(subcell)+local_cell_index;
+  return _subcell_to_local_subcells_host(index);
+}
+
 } // namespace panzer
 
 #endif
diff --git a/packages/panzer/disc-fe/src/Panzer_Workset.cpp b/packages/panzer/disc-fe/src/Panzer_Workset.cpp
index c76f7af72888..4475238fa06a 100644
--- a/packages/panzer/disc-fe/src/Panzer_Workset.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_Workset.cpp
@@ -52,6 +52,7 @@
 #include "Panzer_LocalMeshInfo.hpp"
 #include "Panzer_PointGenerator.hpp"
 #include "Panzer_PointValues2.hpp"
+#include "Panzer_IntegrationValues2_impl.hpp" // needed for lambda dispatch with convertNormalToRotationMatrix()
 
 #include "Panzer_SubcellConnectivity.hpp"
 #include "Panzer_OrientationsInterface.hpp"

From bfc5b7f698c7e846f6458bdd89926215ffd8bc8e Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Mon, 12 Jul 2021 16:15:33 -0600
Subject: [PATCH 09/14] Panzer: integration values is uvm free

While all tests pass, these changes produce a ton of cuda compiler
warnings for calling host function from device functions. It's an
issue with lambda captures triggering ctor and dtor of int value
members. Will fix next.
---
 .../disc-fe/src/Panzer_IntegrationValues2.cpp | 121 ++++++++++--------
 .../disc-fe/src/Panzer_IntegrationValues2.hpp |   1 -
 .../src/Panzer_IntegrationValues2_impl.hpp    |   4 +-
 .../test/core_tests/integration_values2.cpp   |  18 +--
 4 files changed, 81 insertions(+), 63 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index c7004a1e16f2..c9a7abe244ee 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -471,7 +471,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
 
     Kokkos::MDRangePolicy<PHX::Device::execution_space,Kokkos::Rank<3>> policy({0,0,0},{num_cells,num_nodes,num_dims});
     Kokkos::parallel_for("node_coordinates",policy,KOKKOS_LAMBDA (const int cell, const int node, const int dim) {
-          node_coordinates_k(cell,node,dim) = in_node_coordinates_k(cell,node,dim);
+      node_coordinates_k(cell,node,dim) = in_node_coordinates_k(cell,node,dim);
     });
     PHX::Device::execution_space().fence();
   }
@@ -699,25 +699,33 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
 #define PANZER_DOT(a,b) (a[0]*b[0] + a[1]*b[1] + a[2]*b[2])
 #define PANZER_CROSS(a,b,c) {c[0] = a[1]*b[2] - a[2]*b[1]; c[1] = a[2]*b[0] - a[0]*b[2]; c[2] = a[0]*b[1] - a[1]*b[0];}
 
-    // Reorder index vector
-    std::vector<int> point_order(num_points_per_face);
+    // Reorder index vector. This is scratch space that needs to be
+    // allocated per thread now. Should use team size instead of num
+    // faces, but then we need to convert the lambda below to a
+    // functor to query the policy using the functor. We'll just live
+    // with the extra memory since it is temporary scratch.
+    // std::vector<int> point_order(num_points_per_face);
+    PHX::View<int**> point_order("scratch: point_order",face_connectivity.numSubcellsHost(),num_points_per_face);
 
     // Iterate through faces
-    for(int face=0; face<face_connectivity.numSubcellsHost(); ++face){
+    auto ip_coordinates_k = ip_coordinates.get_view();
+    auto surface_rotation_matrices_k = surface_rotation_matrices.get_view();
+    auto& int_values_device = *this; // for cuda
+    Kokkos::parallel_for("face iteration",face_connectivity.numSubcellsHost(),KOKKOS_LAMBDA (const int face) {
       // Cells for sides 0 and 1
       const int cell_0 = face_connectivity.cellForSubcell(face,0);
       const int cell_1 = face_connectivity.cellForSubcell(face,1);
 
       // If this face doesn't connect to anything we don't need to worry about alignment
       if(cell_1 < 0)
-        continue;
+        return;
 
       // Local face index for sides 0 and 1
       const int lidx_0 = face_connectivity.localSubcellForSubcell(face,0);
       const int lidx_1 = face_connectivity.localSubcellForSubcell(face,1);
 
       // If the cell exists, then the local face index needs to exist
-      TEUCHOS_ASSERT(lidx_1 >= 0);
+      KOKKOS_ASSERT(lidx_1 >= 0);
       
       // To compare points on planes, it is good to center the planes around the origin
       // Find the face center for the left and right cell (may not be the same point - e.g. periodic conditions)
@@ -726,14 +734,14 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
       for(int face_point=0; face_point<num_points_per_face; ++face_point){
         Scalar dx2 = 0.;
         for(int dim=0; dim<cell_dim; ++dim){
-          xc0[dim] += ip_coordinates(cell_0,lidx_0*num_points_per_face + face_point,dim);
-          xc1[dim] += ip_coordinates(cell_1,lidx_1*num_points_per_face + face_point,dim);
-          const Scalar dx = ip_coordinates(cell_0,lidx_0*num_points_per_face + face_point,dim) - ip_coordinates(cell_0,lidx_0*num_points_per_face,dim); 
+          xc0[dim] += ip_coordinates_k(cell_0,lidx_0*num_points_per_face + face_point,dim);
+          xc1[dim] += ip_coordinates_k(cell_1,lidx_1*num_points_per_face + face_point,dim);
+          const Scalar dx = ip_coordinates_k(cell_0,lidx_0*num_points_per_face + face_point,dim) - ip_coordinates_k(cell_0,lidx_0*num_points_per_face,dim); 
           dx2 += dx*dx;
         }
 
         // Check if the distance squared between these two points is larger than the others (doesn't need to be perfect)
-        r2 = std::max(r2, dx2);
+        r2 = (r2 < dx2) ? dx2 : r2;
       }
       for(int dim=0; dim<cell_dim; ++dim){
         xc0[dim] /= double(num_points_per_face);
@@ -750,15 +758,15 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
       const int example_point_1 = lidx_1*num_points_per_face;
       
       // Load the transverse and binormal for the 0 cell (default)
-      Scalar t[3] = {surface_rotation_matrices(cell_0,example_point_0,1,0), surface_rotation_matrices(cell_0,example_point_0,1,1), surface_rotation_matrices(cell_0,example_point_0,1,2)};
-      Scalar b[3] = {surface_rotation_matrices(cell_0,example_point_0,2,0), surface_rotation_matrices(cell_0,example_point_0,2,1), surface_rotation_matrices(cell_0,example_point_0,2,2)};
+      Scalar t[3] = {surface_rotation_matrices_k(cell_0,example_point_0,1,0), surface_rotation_matrices_k(cell_0,example_point_0,1,1), surface_rotation_matrices_k(cell_0,example_point_0,1,2)};
+      Scalar b[3] = {surface_rotation_matrices_k(cell_0,example_point_0,2,0), surface_rotation_matrices_k(cell_0,example_point_0,2,1), surface_rotation_matrices_k(cell_0,example_point_0,2,2)};
 
       // In case the faces are not antiparallel (e.g. periodic wedge), we may need to change the transverse and binormal
       {
       
         // Get the normals for each face for constructing one of the plane vectors (transverse)
-        const Scalar n0[3] = {surface_rotation_matrices(cell_0,example_point_0,0,0), surface_rotation_matrices(cell_0,example_point_0,0,1), surface_rotation_matrices(cell_0,example_point_0,0,2)};
-        const Scalar n1[3] = {surface_rotation_matrices(cell_1,example_point_1,0,0), surface_rotation_matrices(cell_1,example_point_1,0,1), surface_rotation_matrices(cell_1,example_point_1,0,2)};
+        const Scalar n0[3] = {surface_rotation_matrices_k(cell_0,example_point_0,0,0), surface_rotation_matrices_k(cell_0,example_point_0,0,1), surface_rotation_matrices_k(cell_0,example_point_0,0,2)};
+        const Scalar n1[3] = {surface_rotation_matrices_k(cell_1,example_point_1,0,0), surface_rotation_matrices_k(cell_1,example_point_1,0,1), surface_rotation_matrices_k(cell_1,example_point_1,0,2)};
 
         // n_0*n_1 == -1 (antiparallel), n_0*n_1 == 1 (parallel - bad), |n_0*n_1| < 1 (other)
         const Scalar n0_dot_n1 = PANZER_DOT(n0,n1);
@@ -767,7 +775,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
         // This causes a host of issues (e.g. identifying 180 degree periodic wedges), but we have to support virtual cells as a priority
         // Therefore, we will just assume that the ordering is fine (not valid for 180 degree periodic wedges)
         if(std::fabs(n0_dot_n1 - 1.) < 1.e-8)
-          continue;
+          return;
 
         // Rotated faces case (e.g. periodic wedge) we need to check for non-antiparallel face normals
         if(std::fabs(n0_dot_n1 + 1.) > 1.e-2){
@@ -794,7 +802,6 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
           b[2] /= mag_b;
           
         }
-        
       }
 
       // Now that we have a reference coordinate plane in which to align our points
@@ -805,7 +812,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
       // Differential position in mesh
       Scalar x0[3] = {0};
       Scalar x1[3] = {0};
-      
+
       // Iterate through points in cell 1 and find which point they align with in cell 0
       for(int face_point_1=0; face_point_1<num_points_per_face; ++face_point_1){
 
@@ -814,14 +821,14 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
 
         // Load position shifted by face center
         for(int dim=0; dim<cell_dim; ++dim)
-          x1[dim] = ip_coordinates(cell_1,point_1,dim) - xc1[dim];
+          x1[dim] = ip_coordinates_k(cell_1,point_1,dim) - xc1[dim];
 
         // Convert position to transverse/binormal plane
         p1[0] = PANZER_DOT(x1,t);
         p1[1] = PANZER_DOT(x1,b);
 
         // Set the default for the point order
-        point_order[face_point_1] = face_point_1;
+        point_order(face,face_point_1) = face_point_1;
         
         // Compare to points on the other surface
         for(int face_point_0=0; face_point_0<num_points_per_face; ++face_point_0){
@@ -831,7 +838,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
 
           // Load position shifted by face center
           for(int dim=0; dim<cell_dim; ++dim)
-            x0[dim] = ip_coordinates(cell_0,point_0,dim) - xc0[dim];
+            x0[dim] = ip_coordinates_k(cell_0,point_0,dim) - xc0[dim];
 
           // Convert position to transverse/binormal plane
           p0[0] = PANZER_DOT(x0,t);
@@ -842,28 +849,30 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
 
           // If the distance, compared to the size of the cell, is small, we assume these are the same points
           if(p012 / r2 < 1.e-12){
-            point_order[face_point_1] = face_point_0;
+            point_order(face,face_point_1) = face_point_0;
             break;
           }
 
           // Big problem - there wan't a point that aligned properly
-          TEUCHOS_ASSERT(face_point_0 != num_points_per_face-1);
+          KOKKOS_ASSERT(face_point_0 != num_points_per_face-1);
 
         }
       }
-      
+
       // Now re-order the points on face 1 to correct the alignment
       const int point_offset = lidx_1*num_points_per_face;
       for( int face_point_1 = 0; face_point_1 < num_points_per_face - 1; ++face_point_1 ){
         // While the point is not yet in place - keep swapping until it is in place (N^2 operations - not great)
-        while( face_point_1 != point_order[face_point_1] ){
+        while( face_point_1 != point_order(face,face_point_1) ){
           // We need to swap with the component in this position
-          const int face_point_0 = point_order[face_point_1];
-          swapQuadraturePoints(cell_1,point_offset+face_point_1,point_offset+face_point_0);
-          std::swap( point_order[face_point_1], point_order[face_point_0] );
+          const int face_point_0 = point_order(face,face_point_1);
+          int_values_device.swapQuadraturePoints(cell_1,point_offset+face_point_1,point_offset+face_point_0);
+          std::swap( point_order(face,face_point_1), point_order(face,face_point_0) );
         }
       }
-    }
+
+    });
+    PHX::Device::execution_space().fence();
   }
     
 #undef PANZER_DOT
@@ -874,41 +883,51 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
   // I'm not sure if these should exist for surface integrals, but here we go!
 
   // Shakib contravarient metric tensor
-  for (int cell = 0; cell < num_cells; ++cell) {
-    for (size_type ip = 0; ip < contravarient.extent(1); ++ip) {
-
+  {
+    auto contravarient_k = this->contravarient.get_static_view();
+    auto covarient_k = this->covarient.get_static_view();
+    auto jac_k = this->jac.get_static_view();
+    Kokkos::MDRangePolicy<PHX::Device::execution_space,Kokkos::Rank<2>> policy({0,0},{(int)num_cells,(int)contravarient.extent(1)});
+    Kokkos::parallel_for("covarient metric tensor",policy,KOKKOS_LAMBDA (const int cell,const int ip) {
       // zero out matrix
-      for (size_type i = 0; i < contravarient.extent(2); ++i)
-        for (size_type j = 0; j < contravarient.extent(3); ++j)
-          covarient(cell,ip,i,j) = 0.0;
+      for (size_type i = 0; i < contravarient_k.extent(2); ++i)
+        for (size_type j = 0; j < contravarient_k.extent(3); ++j)
+          covarient_k(cell,ip,i,j) = 0.0;
 
       // g^{ij} = \frac{\parital x_i}{\partial \chi_\alpha}\frac{\parital x_j}{\partial \chi_\alpha}
-      for (size_type i = 0; i < contravarient.extent(2); ++i) {
-        for (size_type j = 0; j < contravarient.extent(3); ++j) {
-          for (size_type alpha = 0; alpha < contravarient.extent(2); ++alpha) {
-            covarient(cell,ip,i,j) += jac(cell,ip,i,alpha) * jac(cell,ip,j,alpha);
+      for (size_type i = 0; i < contravarient_k.extent(2); ++i) {
+        for (size_type j = 0; j < contravarient_k.extent(3); ++j) {
+          for (size_type alpha = 0; alpha < contravarient_k.extent(2); ++alpha) {
+            covarient_k(cell,ip,i,j) += jac_k(cell,ip,i,alpha) * jac_k(cell,ip,j,alpha);
           }
         }
       }
-
-    }
+    });
+    PHX::Device::execution_space().fence();
   }
 
-  auto s_contravarient = Kokkos::subview(contravarient.get_view(), std::make_pair(0,num_cells),Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
-  auto s_covarient = Kokkos::subview(covarient.get_view(), std::make_pair(0,num_cells),Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
-  Intrepid2::RealSpaceTools<PHX::Device::execution_space>::inverse(s_contravarient, s_covarient);
+  {
+    auto s_contravarient = Kokkos::subview(contravarient.get_view(), std::make_pair(0,num_cells),Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
+    auto s_covarient = Kokkos::subview(covarient.get_view(), std::make_pair(0,num_cells),Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
+    Intrepid2::RealSpaceTools<PHX::Device::execution_space>::inverse(s_contravarient, s_covarient);
+    PHX::Device::execution_space().fence();    
+  }
 
   // norm of g_ij
-  for (int cell = 0; cell < num_cells; ++cell) {
-    for (size_type ip = 0; ip < contravarient.extent(1); ++ip) {
-      norm_contravarient(cell,ip) = 0.0;
-      for (size_type i = 0; i < contravarient.extent(2); ++i) {
-        for (size_type j = 0; j < contravarient.extent(3); ++j) {
-          norm_contravarient(cell,ip) += contravarient(cell,ip,i,j) * contravarient(cell,ip,i,j);
+  {
+    auto contravarient_k = this->contravarient.get_static_view();
+    auto norm_contravarient_k = this->norm_contravarient.get_static_view();
+    Kokkos::MDRangePolicy<PHX::Device::execution_space,Kokkos::Rank<2>> policy({0,0},{(int)num_cells,(int)contravarient.extent(1)});
+    Kokkos::parallel_for("covarient metric tensor",policy,KOKKOS_LAMBDA (const int cell,const int ip) {
+      norm_contravarient_k(cell,ip) = 0.0;
+      for (size_type i = 0; i < contravarient_k.extent(2); ++i) {
+        for (size_type j = 0; j < contravarient_k.extent(3); ++j) {
+          norm_contravarient_k(cell,ip) += contravarient_k(cell,ip,i,j) * contravarient_k(cell,ip,i,j);
         }
       }
-      norm_contravarient(cell,ip) = std::sqrt(norm_contravarient(cell,ip));
-    }
+      norm_contravarient_k(cell,ip) = std::sqrt(norm_contravarient_k(cell,ip));
+    });
+    PHX::Device::execution_space().fence();    
   }
 
 }
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
index 178a5d45dd87..c33e8bb0f265 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
@@ -162,7 +162,6 @@ namespace panzer {
     void swapQuadraturePoints(int cell,int a,int b) const;
 
     KOKKOS_INLINE_FUNCTION
-    //void convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3]);
     static void convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3]);
 
     /// This should be a private method, but using lambdas on cuda forces this to be public.
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
index 8106f1e9b507..8098ebf52c04 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
@@ -71,9 +71,9 @@ swapQuadraturePoints(int cell,
   // Rotation matrices are always in 3D
   for(int dim=0; dim<3; ++dim){
     for(int dim2=0; dim2<3; ++dim2){
-      scratch_for_compute_side_measure(0) = surface_rotation_matrices(cell,new_cell_point,dim,dim2);
+      hold = surface_rotation_matrices(cell,new_cell_point,dim,dim2);
       surface_rotation_matrices(cell,new_cell_point,dim,dim2) = surface_rotation_matrices(cell,old_cell_point,dim,dim2);
-      surface_rotation_matrices(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
+      surface_rotation_matrices(cell,old_cell_point,dim,dim2) = hold;
     }
   }
 }
diff --git a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
index 46b0ab93c325..ce944bab6a76 100644
--- a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
+++ b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
@@ -507,7 +507,7 @@ namespace panzer {
     auto node_coordinates = af.buildStaticArray<double,Cell,NODE,Dim>("node_coordinates",2,4,2);
     {
       auto node_coordinates_host = Kokkos::create_mirror_view(node_coordinates.get_static_view());
-      auto cell_vertices_host = Kokkos::create_mirror_view(mesh.cell_vertices);
+      auto cell_vertices_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),mesh.cell_vertices);
       for(int i=0; i<mesh.cell_vertices.extent_int(0); ++i)
         for(int j=0; j<mesh.cell_vertices.extent_int(1); ++j)
           for(int k=0; k<mesh.cell_vertices.extent_int(2); ++k)
@@ -549,11 +549,11 @@ namespace panzer {
     
     // Face 0
     {
-      const int cell_0 = connectivity->cellForSubcell(0,0);
-      const int cell_1 = connectivity->cellForSubcell(0,1);
+      const int cell_0 = connectivity->cellForSubcellHost(0,0);
+      const int cell_1 = connectivity->cellForSubcellHost(0,1);
 
-      const int lidx_0 = connectivity->localSubcellForSubcell(0,0);
-      const int lidx_1 = connectivity->localSubcellForSubcell(0,1);
+      const int lidx_0 = connectivity->localSubcellForSubcellHost(0,0);
+      const int lidx_1 = connectivity->localSubcellForSubcellHost(0,1);
 
       TEST_EQUALITY(cell_0,0);
       TEST_EQUALITY(cell_1,1);
@@ -585,11 +585,11 @@ namespace panzer {
     
     // Face 2
     {
-      const int cell_0 = connectivity->cellForSubcell(2,0);
-      const int cell_1 = connectivity->cellForSubcell(2,1);
+      const int cell_0 = connectivity->cellForSubcellHost(2,0);
+      const int cell_1 = connectivity->cellForSubcellHost(2,1);
 
-      const int lidx_0 = connectivity->localSubcellForSubcell(2,0);
-      const int lidx_1 = connectivity->localSubcellForSubcell(2,1);
+      const int lidx_0 = connectivity->localSubcellForSubcellHost(2,0);
+      const int lidx_1 = connectivity->localSubcellForSubcellHost(2,1);
 
       TEST_EQUALITY(cell_0,0);
       TEST_EQUALITY(cell_1,1);

From bc049035b298b65cf5495eb1a6e8d7003478423e Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Mon, 12 Jul 2021 17:42:46 -0600
Subject: [PATCH 10/14] Panzer: fix int_values cuda warnings

---
 .../disc-fe/src/Panzer_IntegrationValues2.cpp | 139 ++----------------
 .../disc-fe/src/Panzer_IntegrationValues2.hpp |  37 +++--
 .../src/Panzer_IntegrationValues2_impl.hpp    |  18 ++-
 .../test/core_tests/integration_values2.cpp   |  56 ++++---
 4 files changed, 93 insertions(+), 157 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index c9a7abe244ee..7c2b395e2dee 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -324,128 +324,6 @@ getCubature(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordinates,
                                 *(int_rule->topology));
 }
 
-  /*
-template <typename Scalar>
-void IntegrationValues2<Scalar>::
-convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3])
-{
-  using T = Scalar;
-  
-  const T n  = sqrt(normal[0]*normal[0]+normal[1]*normal[1]+normal[2]*normal[2]);
-
-  // If this fails then the geometry for this cell is probably undefined
-  if(n > 0.){
-    // Make sure transverse is not parallel to normal within some margin of error
-    transverse[0]=0.;transverse[1]=1.;transverse[2]=0.;
-    if(std::fabs(normal[0]*transverse[0]+normal[1]*transverse[1])>0.9){
-      transverse[0]=1.;transverse[1]=0.;
-    }
-
-    const T nt = normal[0]*transverse[0]+normal[1]*transverse[1]+normal[2]*transverse[2];
-
-    // Note normal has unit length
-    const T mult = nt/(n*n); // = nt
-
-    // Remove normal projection from transverse
-    for(int dim=0;dim<3;++dim){
-      transverse[dim] = transverse[dim] - mult * normal[dim];
-    }
-
-    const T t = sqrt(transverse[0]*transverse[0]+transverse[1]*transverse[1]+transverse[2]*transverse[2]);
-    KOKKOS_ASSERT(t != 0.);
-    for(int dim=0;dim<3;++dim){
-      transverse[dim] /= t;
-    }
-
-    // We assume a right handed system such that b = n \times t
-    binormal[0] = (normal[1] * transverse[2] - normal[2] * transverse[1]);
-    binormal[1] = (normal[2] * transverse[0] - normal[0] * transverse[2]);
-    binormal[2] = (normal[0] * transverse[1] - normal[1] * transverse[0]);
-
-    // Normalize binormal
-    const T b = sqrt(binormal[0]*binormal[0]+binormal[1]*binormal[1]+binormal[2]*binormal[2]);
-    for(int dim=0;dim<3;++dim){
-      binormal[dim] /= b;
-    }
-  } else {
-    transverse[0] = 0.;
-    transverse[1] = 0.;
-    transverse[2] = 0.;
-    binormal[0] = 0.;
-    binormal[1] = 0.;
-    binormal[2] = 0.;
-  }
-}
-  */
-  /*  
-template <typename Scalar>
-void IntegrationValues2<Scalar>::
-swapQuadraturePoints(int cell,
-                     int a,
-                     int b) const
-{
-  const int new_cell_point = a;
-  const int old_cell_point = b;
-
-  const int cell_dim = ref_ip_coordinates.extent(2);
-
-#ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(cell < ip_coordinates.extent_int(0));
-  KOKKOS_ASSERT(a < ip_coordinates.extent_int(1));
-  KOKKOS_ASSERT(b < ip_coordinates.extent_int(1));
-  KOKKOS_ASSERT(cell >= 0);
-  KOKKOS_ASSERT(a >= 0);
-  KOKKOS_ASSERT(b >= 0);
-#endif
-
-  // Using scratch_for_compute_side_measure instead of hold. This
-  // works around UVM issues of DFAD temporaries in device code.
-  // Scalar hold;
-
-  scratch_for_compute_side_measure(0) = weighted_measure(cell,new_cell_point);
-  weighted_measure(cell,new_cell_point) = weighted_measure(cell,old_cell_point);
-  weighted_measure(cell,old_cell_point) = scratch_for_compute_side_measure(0);
-
-  scratch_for_compute_side_measure(0) = jac_det(cell,new_cell_point);
-  jac_det(cell,new_cell_point) = jac_det(cell,old_cell_point);
-  jac_det(cell,old_cell_point) = scratch_for_compute_side_measure(0);
-
-  for(int dim=0;dim<cell_dim;++dim){
-
-    scratch_for_compute_side_measure(0) = ref_ip_coordinates(cell,new_cell_point,dim);
-    ref_ip_coordinates(cell,new_cell_point,dim) = ref_ip_coordinates(cell,old_cell_point,dim);
-    ref_ip_coordinates(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
-
-    scratch_for_compute_side_measure(0) = ip_coordinates(cell,new_cell_point,dim);
-    ip_coordinates(cell,new_cell_point,dim) = ip_coordinates(cell,old_cell_point,dim);
-    ip_coordinates(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
-
-    scratch_for_compute_side_measure(0) = surface_normals(cell,new_cell_point,dim);
-    surface_normals(cell,new_cell_point,dim) = surface_normals(cell,old_cell_point,dim);
-    surface_normals(cell,old_cell_point,dim) = scratch_for_compute_side_measure(0);
-
-    for(int dim2=0;dim2<cell_dim;++dim2){
-
-      scratch_for_compute_side_measure(0) = jac(cell,new_cell_point,dim,dim2);
-      jac(cell,new_cell_point,dim,dim2) = jac(cell,old_cell_point,dim,dim2);
-      jac(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
-
-      scratch_for_compute_side_measure(0) = jac_inv(cell,new_cell_point,dim,dim2);
-      jac_inv(cell,new_cell_point,dim,dim2) = jac_inv(cell,old_cell_point,dim,dim2);
-      jac_inv(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
-    }
-  }
-
-  // Rotation matrices are always in 3D
-  for(int dim=0; dim<3; ++dim){
-    for(int dim2=0; dim2<3; ++dim2){
-      scratch_for_compute_side_measure(0) = surface_rotation_matrices(cell,new_cell_point,dim,dim2);
-      surface_rotation_matrices(cell,new_cell_point,dim,dim2) = surface_rotation_matrices(cell,old_cell_point,dim,dim2);
-      surface_rotation_matrices(cell,old_cell_point,dim,dim2) = scratch_for_compute_side_measure(0);
-    }
-  }
-}
-  */
 template <typename Scalar>
 void IntegrationValues2<Scalar>::
 generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordinates,
@@ -708,9 +586,14 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
     PHX::View<int**> point_order("scratch: point_order",face_connectivity.numSubcellsHost(),num_points_per_face);
 
     // Iterate through faces
+    auto ref_ip_coordinates_k = ref_ip_coordinates.get_view();
     auto ip_coordinates_k = ip_coordinates.get_view();
+    auto weighted_measure_k = weighted_measure.get_view();
+    auto jac_k = jac.get_view();
+    auto jac_det_k = jac_det.get_view();
+    auto jac_inv_k = jac_inv.get_view();
+    auto surface_normals_k = surface_normals.get_view();
     auto surface_rotation_matrices_k = surface_rotation_matrices.get_view();
-    auto& int_values_device = *this; // for cuda
     Kokkos::parallel_for("face iteration",face_connectivity.numSubcellsHost(),KOKKOS_LAMBDA (const int face) {
       // Cells for sides 0 and 1
       const int cell_0 = face_connectivity.cellForSubcell(face,0);
@@ -866,7 +749,15 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
         while( face_point_1 != point_order(face,face_point_1) ){
           // We need to swap with the component in this position
           const int face_point_0 = point_order(face,face_point_1);
-          int_values_device.swapQuadraturePoints(cell_1,point_offset+face_point_1,point_offset+face_point_0);
+          panzer::swapQuadraturePoints<double>(cell_1,point_offset+face_point_1,point_offset+face_point_0,
+                                               ref_ip_coordinates_k,
+                                               ip_coordinates_k,
+                                               weighted_measure_k,
+                                               jac_k,
+                                               jac_det_k,
+                                               jac_inv_k,
+                                               surface_normals_k,
+                                               surface_rotation_matrices_k);
           std::swap( point_order(face,face_point_1), point_order(face,face_point_0) );
         }
       }
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
index c33e8bb0f265..8272bb524657 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
@@ -57,6 +57,31 @@ namespace panzer {
 
   class SubcellConnectivity;
 
+  /**
+   * \brief Swap the ordering of quadrature points in a specified cell.
+   *
+   * \param[in] cell   Cell index
+   * \param[in] a      Quadrature point a
+   * \param[in] b      Quadrature point b
+   *
+   * NOTE: this should be a member function of IntegrationValues but
+   * lambda capture generates a ton of cuda compiler warnings. Making
+   * it a stand alone function fixes this.
+   */
+  template<typename Scalar,
+           typename T0,typename T1,typename T2,typename T3,
+           typename T4,typename T5,typename T6,typename T7>
+  KOKKOS_INLINE_FUNCTION
+  void swapQuadraturePoints(int cell,int a,int b,
+                            T0& ref_ip_coordinates,
+                            T1& ip_coordinates,
+                            T2& weighted_measure,
+                            T3& jac,
+                            T4& jac_det,
+                            T5& jac_inv,
+                            T6& surface_normals,
+                            T7& surface_rotation_matrices);
+
   template <typename Scalar>
   class IntegrationValues2 {
   public:
@@ -149,18 +174,6 @@ namespace panzer {
 
     Array_Point scratch_for_compute_side_measure; // <Point> size: span() == jac.span()
 
-    /**
-     * \brief Swap the ordering of quadrature points in a specified cell.
-     *
-     * \param[in] cell   Cell index
-     * \param[in] a      Quadrature point a
-     * \param[in] b      Quadrature point b
-     *
-     * NOTE: this is a non-const function but we had to make it const to run on device.
-     */
-    KOKKOS_INLINE_FUNCTION
-    void swapQuadraturePoints(int cell,int a,int b) const;
-
     KOKKOS_INLINE_FUNCTION
     static void convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3]);
 
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
index 8098ebf52c04..86ecb5eeabd6 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
@@ -10,12 +10,22 @@
    testing).
  */
 namespace panzer {
-  
-template <typename Scalar>
-void IntegrationValues2<Scalar>::
+
+template<typename Scalar,
+         typename T0,typename T1,typename T2,typename T3,
+         typename T4,typename T5,typename T6,typename T7>
+void
 swapQuadraturePoints(int cell,
                      int a,
-                     int b) const
+                     int b,
+                     T0& ref_ip_coordinates,
+                     T1& ip_coordinates,
+                     T2& weighted_measure,
+                     T3& jac,
+                     T4& jac_det,
+                     T5& jac_inv,
+                     T6& surface_normals,
+                     T7& surface_rotation_matrices)
 {
   const int new_cell_point = a;
   const int old_cell_point = b;
diff --git a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
index ce944bab6a76..f50ef6961b8f 100644
--- a/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
+++ b/packages/panzer/disc-fe/test/core_tests/integration_values2.cpp
@@ -262,21 +262,6 @@ namespace panzer {
 
   }
 
-  // This should be a simple lambda but the RCPs and std::vector in
-  // the int_values triggers warnings for calling host code form a
-  // host/device function. We eliminate the warnings by using a
-  // functor that is memcopied to device.
-  template<typename Scalar>
-  struct DoSwapOnDevice {
-    int cell_,orig_pt_,new_pt_;
-    panzer::IntegrationValues2<Scalar> int_values_;
-    DoSwapOnDevice(int cell,int orig_pt,int new_pt,panzer::IntegrationValues2<Scalar>& int_values)
-      : cell_(cell),orig_pt_(orig_pt),new_pt_(new_pt),int_values_(int_values){}
-    KOKKOS_INLINE_FUNCTION void operator() (const int ) const {
-      int_values_.swapQuadraturePoints(cell_,orig_pt_,new_pt_);
-    }
-  };
-
   TEUCHOS_UNIT_TEST(integration_values, quadpt_swap)
   {    
     Teuchos::RCP<shards::CellTopology> topo = 
@@ -350,7 +335,25 @@ namespace panzer {
       int org_pt = 0;
       int new_pt = 1;
 
-      Kokkos::parallel_for("swap qp",1,DoSwapOnDevice<double>(tst_cell,org_pt,new_pt,int_values));
+      auto ref_ip_coordinates_k = int_values.ref_ip_coordinates.get_view();
+      auto ip_coordinates_k = int_values.ip_coordinates.get_view();
+      auto weighted_measure_k = int_values.weighted_measure.get_view();
+      auto jac_k = int_values.jac.get_view();
+      auto jac_det_k = int_values.jac_det.get_view();
+      auto jac_inv_k = int_values.jac_inv.get_view();
+      auto surface_normals_k = int_values.surface_normals.get_view();
+      auto surface_rotation_matrices_k = int_values.surface_rotation_matrices.get_view();
+      Kokkos::parallel_for("swap qp",1,KOKKOS_LAMBDA (const int ) {
+        panzer::swapQuadraturePoints<double>(tst_cell,org_pt,new_pt,
+                                             ref_ip_coordinates_k,
+                                             ip_coordinates_k,
+                                             weighted_measure_k,
+                                             jac_k,
+                                             jac_det_k,
+                                             jac_inv_k,
+                                             surface_normals_k,
+                                             surface_rotation_matrices_k);
+      });
 
       Kokkos::deep_copy(weighted_measure,int_values.weighted_measure.get_view());
       Kokkos::deep_copy(jac_det,int_values.jac_det.get_view());
@@ -380,7 +383,26 @@ namespace panzer {
       int org_pt = 1;
       int new_pt = 3;
 
-      Kokkos::parallel_for("swap qp",1,DoSwapOnDevice<double>(tst_cell,org_pt,new_pt,int_values));
+      auto ref_ip_coordinates_k = int_values.ref_ip_coordinates.get_view();
+      auto ip_coordinates_k = int_values.ip_coordinates.get_view();
+      auto weighted_measure_k = int_values.weighted_measure.get_view();
+      auto jac_k = int_values.jac.get_view();
+      auto jac_det_k = int_values.jac_det.get_view();
+      auto jac_inv_k = int_values.jac_inv.get_view();
+      auto surface_normals_k = int_values.surface_normals.get_view();
+      auto surface_rotation_matrices_k = int_values.surface_rotation_matrices.get_view();
+      // Kokkos::parallel_for("swap qp",1,DoSwapOnDevice<double>(tst_cell,org_pt,new_pt,int_values));
+      Kokkos::parallel_for("swap qp",1,KOKKOS_LAMBDA (const int ) {
+        panzer::swapQuadraturePoints<double>(tst_cell,org_pt,new_pt,
+                                             ref_ip_coordinates_k,
+                                             ip_coordinates_k,
+                                             weighted_measure_k,
+                                             jac_k,
+                                             jac_det_k,
+                                             jac_inv_k,
+                                             surface_normals_k,
+                                             surface_rotation_matrices_k);
+      });
 
       Kokkos::deep_copy(weighted_measure,int_values.weighted_measure.get_view());
       Kokkos::deep_copy(jac_det,int_values.jac_det.get_view());

From b83d1042b6a01fa94e187f590786e39fef78632c Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Wed, 14 Jul 2021 08:45:22 -0600
Subject: [PATCH 11/14] Panzer: move convertNormalToRotationMatrix out of
 IntValues

---
 .../Panzer_ConvertNormalToRotationMatrix.hpp  | 61 +++++++++++++++++++
 .../disc-fe/src/Panzer_IntegrationValues2.cpp |  3 +-
 .../disc-fe/src/Panzer_IntegrationValues2.hpp |  3 -
 .../src/Panzer_IntegrationValues2_impl.hpp    | 52 ----------------
 .../panzer/disc-fe/src/Panzer_Workset.cpp     |  4 +-
 5 files changed, 65 insertions(+), 58 deletions(-)
 create mode 100644 packages/panzer/disc-fe/src/Panzer_ConvertNormalToRotationMatrix.hpp

diff --git a/packages/panzer/disc-fe/src/Panzer_ConvertNormalToRotationMatrix.hpp b/packages/panzer/disc-fe/src/Panzer_ConvertNormalToRotationMatrix.hpp
new file mode 100644
index 000000000000..ed92d3f7430d
--- /dev/null
+++ b/packages/panzer/disc-fe/src/Panzer_ConvertNormalToRotationMatrix.hpp
@@ -0,0 +1,61 @@
+#ifndef PANZER_CONVERT_NORMAL_TO_ROTATION_MATRIX_HPP
+#define PANZER_CONVERT_NORMAL_TO_ROTATION_MATRIX_HPP
+
+namespace panzer {
+
+template <typename Scalar>
+KOKKOS_INLINE_FUNCTION
+void
+convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3])
+{
+  using T = Scalar;
+  
+  const T n  = sqrt(normal[0]*normal[0]+normal[1]*normal[1]+normal[2]*normal[2]);
+
+  // If this fails then the geometry for this cell is probably undefined
+  if(n > 0.){
+    // Make sure transverse is not parallel to normal within some margin of error
+    transverse[0]=0.;transverse[1]=1.;transverse[2]=0.;
+    if(std::fabs(normal[0]*transverse[0]+normal[1]*transverse[1])>0.9){
+      transverse[0]=1.;transverse[1]=0.;
+    }
+
+    const T nt = normal[0]*transverse[0]+normal[1]*transverse[1]+normal[2]*transverse[2];
+
+    // Note normal has unit length
+    const T mult = nt/(n*n); // = nt
+
+    // Remove normal projection from transverse
+    for(int dim=0;dim<3;++dim){
+      transverse[dim] = transverse[dim] - mult * normal[dim];
+    }
+
+    const T t = sqrt(transverse[0]*transverse[0]+transverse[1]*transverse[1]+transverse[2]*transverse[2]);
+    KOKKOS_ASSERT(t != 0.);
+    for(int dim=0;dim<3;++dim){
+      transverse[dim] /= t;
+    }
+
+    // We assume a right handed system such that b = n \times t
+    binormal[0] = (normal[1] * transverse[2] - normal[2] * transverse[1]);
+    binormal[1] = (normal[2] * transverse[0] - normal[0] * transverse[2]);
+    binormal[2] = (normal[0] * transverse[1] - normal[1] * transverse[0]);
+
+    // Normalize binormal
+    const T b = sqrt(binormal[0]*binormal[0]+binormal[1]*binormal[1]+binormal[2]*binormal[2]);
+    for(int dim=0;dim<3;++dim){
+      binormal[dim] /= b;
+    }
+  } else {
+    transverse[0] = 0.;
+    transverse[1] = 0.;
+    transverse[2] = 0.;
+    binormal[0] = 0.;
+    binormal[1] = 0.;
+    binormal[2] = 0.;
+  }
+}
+
+}
+
+#endif
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index 7c2b395e2dee..a2d606e0e36f 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -57,6 +57,7 @@
 #include "Panzer_CommonArrayFactories.hpp"
 #include "Panzer_Traits.hpp"
 #include "Panzer_SubcellConnectivity.hpp"
+#include "Panzer_ConvertNormalToRotationMatrix.hpp"
 
 // For device member functions to avoid CUDA RDC in unit testing
 #include "Panzer_IntegrationValues2_impl.hpp"
@@ -548,7 +549,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
       
       Scalar transverse[3];
       Scalar binormal[3];
-      convertNormalToRotationMatrix(normal,transverse,binormal);
+      panzer::convertNormalToRotationMatrix(normal,transverse,binormal);
       
       for(int dim=0; dim<3; ++dim){
         surface_rotation_matrices_k(cell,point,0,dim) = normal[dim];
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
index 8272bb524657..bab023aa6e54 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.hpp
@@ -174,9 +174,6 @@ namespace panzer {
 
     Array_Point scratch_for_compute_side_measure; // <Point> size: span() == jac.span()
 
-    KOKKOS_INLINE_FUNCTION
-    static void convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3]);
-
     /// This should be a private method, but using lambdas on cuda forces this to be public.
     void evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim> & in_node_coordinates, const int in_num_cells);
 
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
index 86ecb5eeabd6..2af2de2e8381 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2_impl.hpp
@@ -88,58 +88,6 @@ swapQuadraturePoints(int cell,
   }
 }
 
-template <typename Scalar>
-void IntegrationValues2<Scalar>::
-convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scalar binormal[3])
-{
-  using T = Scalar;
-  
-  const T n  = sqrt(normal[0]*normal[0]+normal[1]*normal[1]+normal[2]*normal[2]);
-
-  // If this fails then the geometry for this cell is probably undefined
-  if(n > 0.){
-    // Make sure transverse is not parallel to normal within some margin of error
-    transverse[0]=0.;transverse[1]=1.;transverse[2]=0.;
-    if(std::fabs(normal[0]*transverse[0]+normal[1]*transverse[1])>0.9){
-      transverse[0]=1.;transverse[1]=0.;
-    }
-
-    const T nt = normal[0]*transverse[0]+normal[1]*transverse[1]+normal[2]*transverse[2];
-
-    // Note normal has unit length
-    const T mult = nt/(n*n); // = nt
-
-    // Remove normal projection from transverse
-    for(int dim=0;dim<3;++dim){
-      transverse[dim] = transverse[dim] - mult * normal[dim];
-    }
-
-    const T t = sqrt(transverse[0]*transverse[0]+transverse[1]*transverse[1]+transverse[2]*transverse[2]);
-    KOKKOS_ASSERT(t != 0.);
-    for(int dim=0;dim<3;++dim){
-      transverse[dim] /= t;
-    }
-
-    // We assume a right handed system such that b = n \times t
-    binormal[0] = (normal[1] * transverse[2] - normal[2] * transverse[1]);
-    binormal[1] = (normal[2] * transverse[0] - normal[0] * transverse[2]);
-    binormal[2] = (normal[0] * transverse[1] - normal[1] * transverse[0]);
-
-    // Normalize binormal
-    const T b = sqrt(binormal[0]*binormal[0]+binormal[1]*binormal[1]+binormal[2]*binormal[2]);
-    for(int dim=0;dim<3;++dim){
-      binormal[dim] /= b;
-    }
-  } else {
-    transverse[0] = 0.;
-    transverse[1] = 0.;
-    transverse[2] = 0.;
-    binormal[0] = 0.;
-    binormal[1] = 0.;
-    binormal[2] = 0.;
-  }
-}
-
 }
 
 #endif
diff --git a/packages/panzer/disc-fe/src/Panzer_Workset.cpp b/packages/panzer/disc-fe/src/Panzer_Workset.cpp
index 4475238fa06a..52355178b38e 100644
--- a/packages/panzer/disc-fe/src/Panzer_Workset.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_Workset.cpp
@@ -52,7 +52,7 @@
 #include "Panzer_LocalMeshInfo.hpp"
 #include "Panzer_PointGenerator.hpp"
 #include "Panzer_PointValues2.hpp"
-#include "Panzer_IntegrationValues2_impl.hpp" // needed for lambda dispatch with convertNormalToRotationMatrix()
+#include "Panzer_ConvertNormalToRotationMatrix.hpp"
 
 #include "Panzer_SubcellConnectivity.hpp"
 #include "Panzer_OrientationsInterface.hpp"
@@ -148,7 +148,7 @@ correctVirtualNormals(const Teuchos::RCP<panzer::IntegrationValues2<double> > iv
           normal[d] = -n_d;
         }
 
-        panzer::IntegrationValues2<double>::convertNormalToRotationMatrix(normal,transverse,binormal);
+        panzer::convertNormalToRotationMatrix(normal,transverse,binormal);
 
         for(int dim=0; dim<3; ++dim){
           iv->surface_rotation_matrices(virtual_cell,virtual_cell_point,0,dim) = normal[dim];

From f9ca3d48d2610d8d4bf30ee7fe8d0b38b3d6a012 Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Wed, 14 Jul 2021 12:57:52 -0600
Subject: [PATCH 12/14] Panzer: switch to use Kokkos version of math functions

---
 .../Panzer_ConvertNormalToRotationMatrix.hpp  |  2 +-
 .../disc-fe/src/Panzer_IntegrationValues2.cpp | 24 ++++++++++++-------
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_ConvertNormalToRotationMatrix.hpp b/packages/panzer/disc-fe/src/Panzer_ConvertNormalToRotationMatrix.hpp
index ed92d3f7430d..6e4454f383b8 100644
--- a/packages/panzer/disc-fe/src/Panzer_ConvertNormalToRotationMatrix.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_ConvertNormalToRotationMatrix.hpp
@@ -16,7 +16,7 @@ convertNormalToRotationMatrix(const Scalar normal[3], Scalar transverse[3], Scal
   if(n > 0.){
     // Make sure transverse is not parallel to normal within some margin of error
     transverse[0]=0.;transverse[1]=1.;transverse[2]=0.;
-    if(std::fabs(normal[0]*transverse[0]+normal[1]*transverse[1])>0.9){
+    if(Kokkos::Experimental::fabs(normal[0]*transverse[0]+normal[1]*transverse[1])>0.9){
       transverse[0]=1.;transverse[1]=0.;
     }
 
diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index a2d606e0e36f..367e5ec6c2c5 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -491,7 +491,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
           }
           // If n is zero then this is - hopefully - a virtual cell
           if(n > 0.){
-            n = std::sqrt(n);
+            n = Kokkos::Experimental::sqrt(n);
             for(int dim=0;dim<cell_dim;++dim){
               side_normals(cell,point,dim) /= n;
             }
@@ -658,18 +658,18 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
         // FIXME: Currently virtual cells will set their surface normal along the same direction as the cell they "reflect"
         // This causes a host of issues (e.g. identifying 180 degree periodic wedges), but we have to support virtual cells as a priority
         // Therefore, we will just assume that the ordering is fine (not valid for 180 degree periodic wedges)
-        if(std::fabs(n0_dot_n1 - 1.) < 1.e-8)
+        if(Kokkos::Experimental::fabs(n0_dot_n1 - 1.) < 1.e-8)
           return;
 
         // Rotated faces case (e.g. periodic wedge) we need to check for non-antiparallel face normals
-        if(std::fabs(n0_dot_n1 + 1.) > 1.e-2){
+        if(Kokkos::Experimental::fabs(n0_dot_n1 + 1.) > 1.e-2){
 
           // Now we need to define an arbitrary transverse and binormal in the plane across which the faces are anti-symmetric
           // We can do this by setting t = n_0 \times n_1
           PANZER_CROSS(n0,n1,t);
 
           // Normalize the transverse vector
-          const Scalar mag_t = std::sqrt(PANZER_DOT(t,t));
+          const Scalar mag_t = Kokkos::Experimental::sqrt(PANZER_DOT(t,t));
           t[0] /= mag_t;
           t[1] /= mag_t;
           t[2] /= mag_t;
@@ -680,7 +680,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
           b[2] = n0[2] + n1[2];
 
           // Normalize the binormal vector
-          const Scalar mag_b = std::sqrt(PANZER_DOT(b,b));
+          const Scalar mag_b = Kokkos::Experimental::sqrt(PANZER_DOT(b,b));
           b[0] /= mag_b;
           b[1] /= mag_b;
           b[2] /= mag_b;
@@ -759,7 +759,9 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
                                                jac_inv_k,
                                                surface_normals_k,
                                                surface_rotation_matrices_k);
-          std::swap( point_order(face,face_point_1), point_order(face,face_point_0) );
+          Scalar tmp = point_order(face,face_point_1);
+          point_order(face,face_point_1) = point_order(face,face_point_0);
+          point_order(face,face_point_0) = tmp;
         }
       }
 
@@ -817,7 +819,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
           norm_contravarient_k(cell,ip) += contravarient_k(cell,ip,i,j) * contravarient_k(cell,ip,i,j);
         }
       }
-      norm_contravarient_k(cell,ip) = std::sqrt(norm_contravarient_k(cell,ip));
+      norm_contravarient_k(cell,ip) = Kokkos::Experimental::sqrt(norm_contravarient_k(cell,ip));
     });
     PHX::Device::execution_space().fence();    
   }
@@ -863,6 +865,7 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
                          cub_points.get_view(),
                          node_coordinates.get_view(),
                          *(int_rule->topology));
+  PHX::Device::execution_space().fence();
 
   auto s_jac_inv = Kokkos::subview(jac_inv.get_view(),std::make_pair(0,num_cells),Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL());
   cell_tools.setJacobianInv(s_jac_inv, s_jac);
@@ -870,6 +873,8 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
   auto s_jac_det = Kokkos::subview(jac_det.get_view(),std::make_pair(0,num_cells),Kokkos::ALL());
   cell_tools.setJacobianDet(s_jac_det, s_jac);
 
+  PHX::Device::execution_space().fence();
+
   auto s_weighted_measure = Kokkos::subview(weighted_measure.get_view(),std::make_pair(0,num_cells),Kokkos::ALL());
   if (!int_rule->isSide()) {
     Intrepid2::FunctionSpaceTools<PHX::Device::execution_space>::
@@ -889,6 +894,8 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
   }
   else TEUCHOS_ASSERT(false);
 
+  PHX::Device::execution_space().fence();
+
   // Shakib contravarient metric tensor
   {
     auto covarient_k = covarient.get_view();
@@ -916,6 +923,7 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
   auto s_covarient = Kokkos::subview(covarient.get_view(),std::make_pair(0,num_cells),Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL());
   auto s_contravarient = Kokkos::subview(contravarient.get_view(),std::make_pair(0,num_cells),Kokkos::ALL(),Kokkos::ALL(),Kokkos::ALL());
   Intrepid2::RealSpaceTools<PHX::Device::execution_space>::inverse(s_contravarient, s_covarient);
+  PHX::Device::execution_space().fence();
 
   // norm of g_ij
   {
@@ -929,7 +937,7 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
           norm_contravarient_k(cell,ip) += contravarient_k(cell,ip,i,j) * contravarient_k(cell,ip,i,j);
         }
       }
-      norm_contravarient_k(cell,ip) = std::sqrt(norm_contravarient_k(cell,ip));
+      norm_contravarient_k(cell,ip) = Kokkos::Experimental::sqrt(norm_contravarient_k(cell,ip));
     });
     PHX::Device::execution_space().fence();
   }

From 74066e32b7fc812d31539eac26a593a20af315c7 Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Wed, 14 Jul 2021 15:45:32 -0600
Subject: [PATCH 13/14] Panzer: changes to subcell size functions for uvm
 removal

---
 .../disc-fe/src/Panzer_IntegrationValues2.cpp |  4 +--
 .../src/Panzer_SubcellConnectivity.cpp        |  7 ----
 .../src/Panzer_SubcellConnectivity.hpp        | 36 +++++++------------
 3 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index 367e5ec6c2c5..8ab12f96c901 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -584,7 +584,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
     // functor to query the policy using the functor. We'll just live
     // with the extra memory since it is temporary scratch.
     // std::vector<int> point_order(num_points_per_face);
-    PHX::View<int**> point_order("scratch: point_order",face_connectivity.numSubcellsHost(),num_points_per_face);
+    PHX::View<int**> point_order("scratch: point_order",face_connectivity.numSubcells(),num_points_per_face);
 
     // Iterate through faces
     auto ref_ip_coordinates_k = ref_ip_coordinates.get_view();
@@ -595,7 +595,7 @@ generateSurfaceCubatureValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_
     auto jac_inv_k = jac_inv.get_view();
     auto surface_normals_k = surface_normals.get_view();
     auto surface_rotation_matrices_k = surface_rotation_matrices.get_view();
-    Kokkos::parallel_for("face iteration",face_connectivity.numSubcellsHost(),KOKKOS_LAMBDA (const int face) {
+    Kokkos::parallel_for("face iteration",face_connectivity.numSubcells(),KOKKOS_LAMBDA (const int face) {
       // Cells for sides 0 and 1
       const int cell_0 = face_connectivity.cellForSubcell(face,0);
       const int cell_1 = face_connectivity.cellForSubcell(face,1);
diff --git a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
index 34fc08e5c38b..160d2b1c3958 100644
--- a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.cpp
@@ -55,13 +55,6 @@ setup(const panzer::LocalMeshPartition & partition)
   const int num_faces_per_cell = partition.cell_to_faces.extent(1);
   const int num_cells_per_face = 2;
 
-  _num_subcells = num_faces;
-  _num_subcells_device = PHX::View<int*>("_num_subcells_device",1);
-  Kokkos::deep_copy(_num_subcells_device,_num_subcells);
-  _num_cells = num_cells;
-  _num_cells_device = PHX::View<int*>("_num_cells_device",1);
-  Kokkos::deep_copy(_num_cells_device,_num_cells);
-
   _subcell_to_cells_adj = PHX::View<int*>("subcell_to_cells_adj", num_faces+1);
   _subcell_to_cells = PHX::View<int*>("subcell_to_cells", num_faces*num_cells_per_face);
   _subcell_to_local_subcells = PHX::View<int*>("subcell_to_local_subcells", num_faces*num_cells_per_face);
diff --git a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.hpp b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.hpp
index 5a255c3aa3a9..b6c639c1336d 100644
--- a/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.hpp
+++ b/packages/panzer/disc-fe/src/Panzer_SubcellConnectivity.hpp
@@ -58,7 +58,7 @@ class SubcellConnectivity
 public:
 
   /// Default constructor
-  SubcellConnectivity():_num_subcells(-1),_num_cells(-1){}
+  SubcellConnectivity() = default;
 
   /// Default destructor
   ~SubcellConnectivity() = default;
@@ -69,8 +69,7 @@ class SubcellConnectivity
    * \return Number of subcells associated with the cells
    */
   KOKKOS_INLINE_FUNCTION
-  int numSubcells() const {return _num_subcells_device(0);}
-  int numSubcellsHost() const {return _num_subcells;}
+  int numSubcells() const {return _subcell_to_cells_adj.extent(0)-1;}
 
   /**
    * \brief Gives number of cells in connectivity
@@ -78,8 +77,7 @@ class SubcellConnectivity
    * \return Number of subcells associated with the cells
    */
   KOKKOS_INLINE_FUNCTION
-  int numCells() const {return _num_cells_device(0);}
-  int numCellsHost() const {return _num_cells;}
+  int numCells() const {return _cell_to_subcells_adj.extent(0)-1;}
 
   /**
    * \brief gives number of subcells (e.g. faces) found on a given cell
@@ -166,14 +164,6 @@ class SubcellConnectivity
 
 protected:
 
-  /// Number of subcells for a given number of cells
-  int _num_subcells;
-  PHX::View<int*> _num_subcells_device;
-
-  /// Number of cells
-  int _num_cells;
-  PHX::View<int*> _num_cells_device;
-
   /// Adjacency array for indexing into subcell_to_cells array
   PHX::View<int*> _subcell_to_cells_adj;
   PHX::View<int*>::HostMirror _subcell_to_cells_adj_host;
@@ -230,7 +220,7 @@ SubcellConnectivity::
 numSubcellsOnCell(const int cell) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(cell >= 0 and cell < _num_cells_device(0));
+  KOKKOS_ASSERT(cell >= 0 and cell < numCells());
 #endif
   return _cell_to_subcells_adj(cell+1)-_cell_to_subcells_adj(cell);
 }
@@ -240,7 +230,7 @@ SubcellConnectivity::
 numSubcellsOnCellHost(const int cell) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(cell >= 0 and cell < _num_cells);
+  KOKKOS_ASSERT(cell >= 0 and cell < numCells());
 #endif
   return _cell_to_subcells_adj_host(cell+1)-_cell_to_subcells_adj_host(cell);
 }
@@ -250,7 +240,7 @@ SubcellConnectivity::
 numCellsOnSubcell(const int subcell) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells_device(0));
+  KOKKOS_ASSERT(subcell >= 0 and subcell < numSubcells());
 #endif
   return _subcell_to_cells_adj(subcell+1)-_subcell_to_cells_adj(subcell);
 }
@@ -260,7 +250,7 @@ SubcellConnectivity::
 numCellsOnSubcellHost(const int subcell) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
+  KOKKOS_ASSERT(subcell >= 0 and subcell < numSubcells());
 #endif
   return _subcell_to_cells_adj_host(subcell+1)-_subcell_to_cells_adj_host(subcell);
 }
@@ -270,7 +260,7 @@ SubcellConnectivity::
 subcellForCell(const int cell, const int local_subcell_index) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(cell >= 0 and cell < _num_cells_device(0));
+  KOKKOS_ASSERT(cell >= 0 and cell < numCells());
   KOKKOS_ASSERT(local_subcell_index < numSubcellsOnCell(cell));
 #endif
   const int index = _cell_to_subcells_adj(cell)+local_subcell_index;
@@ -282,7 +272,7 @@ SubcellConnectivity::
 subcellForCellHost(const int cell, const int local_subcell_index) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(cell >= 0 and cell < _num_cells);
+  KOKKOS_ASSERT(cell >= 0 and cell < numCell());
   KOKKOS_ASSERT(local_subcell_index < numSubcellsOnCellHost(cell));
 #endif
   const int index = _cell_to_subcells_adj_host(cell)+local_subcell_index;
@@ -294,7 +284,7 @@ SubcellConnectivity::
 cellForSubcell(const int subcell, const int local_cell_index) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells_device(0));
+  KOKKOS_ASSERT(subcell >= 0 and subcell < numSubcells());
   KOKKOS_ASSERT(local_cell_index < numCellsOnSubcell(subcell));
 #endif
   const int index = _subcell_to_cells_adj(subcell)+local_cell_index;
@@ -306,7 +296,7 @@ SubcellConnectivity::
 cellForSubcellHost(const int subcell, const int local_cell_index) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
+  KOKKOS_ASSERT(subcell >= 0 and subcell < numSubcells());
   KOKKOS_ASSERT(local_cell_index < numCellsOnSubcellHost(subcell));
 #endif
   const int index = _subcell_to_cells_adj_host(subcell)+local_cell_index;
@@ -318,7 +308,7 @@ SubcellConnectivity::
 localSubcellForSubcell(const int subcell, const int local_cell_index) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells_device(0));
+  KOKKOS_ASSERT(subcell >= 0 and subcell < numSubcells());
   KOKKOS_ASSERT(local_cell_index < numCellsOnSubcell(subcell));
 #endif
   const int index = _subcell_to_cells_adj(subcell)+local_cell_index;
@@ -330,7 +320,7 @@ SubcellConnectivity::
 localSubcellForSubcellHost(const int subcell, const int local_cell_index) const
 {
 #ifdef PANZER_DEBUG
-  KOKKOS_ASSERT(subcell >= 0 and subcell < _num_subcells);
+  KOKKOS_ASSERT(subcell >= 0 and subcell < numSubcells());
   KOKKOS_ASSERT(local_cell_index < numCellsOnSubcellHost(subcell));
 #endif
   const int index = _subcell_to_cells_adj_host(subcell)+local_cell_index;

From 9d89a89bd96bf631f750b5b06f4c3b3122e5718a Mon Sep 17 00:00:00 2001
From: Roger Pawlowski <rppawlo@sandia.gov>
Date: Thu, 15 Jul 2021 07:03:56 -0600
Subject: [PATCH 14/14] Panzer: fix clang/gcc compiler warning

---
 packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
index 8ab12f96c901..da55b2e18b2d 100644
--- a/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
+++ b/packages/panzer/disc-fe/src/Panzer_IntegrationValues2.cpp
@@ -852,7 +852,7 @@ evaluateRemainingValues(const PHX::MDField<Scalar,Cell,NODE,Dim>& in_node_coordi
     auto node_coordinates_k = node_coordinates.get_view();
     auto in_node_coordinates_k = in_node_coordinates.get_view();
 
-    Kokkos::MDRangePolicy<PHX::Device,Kokkos::Rank<3>> policy({0,0,0},{num_cells,num_nodes,num_dims});
+    Kokkos::MDRangePolicy<PHX::Device,Kokkos::Rank<3>> policy({0,0,0},{(int)num_cells,(int)num_nodes,(int)num_dims});
     Kokkos::parallel_for("node coordinates",policy,KOKKOS_LAMBDA (const int cell,const int node,const int dim) {
       node_coordinates_k(cell,node,dim) = in_node_coordinates_k(cell,node,dim);
     });