From b55ec8cdcd8e42b7b3f1d6978d740140f5fb7c06 Mon Sep 17 00:00:00 2001
From: "K. Devine" <kddevin@sandia.gov>
Date: Wed, 15 Apr 2020 12:57:00 -0600
Subject: [PATCH 1/8] tpetra:  Adding a test that doesn't use row-based matrix
 distribution

---
 .../tpetra/core/test/CrsMatrix/CMakeLists.txt |   9 +
 .../test/CrsMatrix/CrsMatrix_2DRandomDist.cpp | 367 ++++++++++++++++++
 2 files changed, 376 insertions(+)
 create mode 100644 packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp

diff --git a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
index 8f3ff7d3fa4a..1b9bad78415b 100644
--- a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
+++ b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt
@@ -88,6 +88,15 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   STANDARD_PASS_OUTPUT
   )
 
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  CrsMatrix_2DRandomDist
+  SOURCES
+    CrsMatrix_2DRandomDist.cpp
+  COMM serial mpi
+  PASS_REGULAR_EXPRESSION "PASS"
+  FAIL_REGULAR_EXPRESSION "FAIL"
+  )
+
 # We split the CrsMatrix_WithGraph test by execution space.
 # This speeds up the build.
 
diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp
new file mode 100644
index 000000000000..f966fd938d16
--- /dev/null
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp
@@ -0,0 +1,367 @@
+/*
+// @HEADER
+// ***********************************************************************
+//
+//          Tpetra: Templated Linear Algebra Services Package
+//                 Copyright (2008) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ************************************************************************
+// @HEADER
+*/
+
+// This program tests matrix creation and matrix apply using matrices with
+// arbitrarily distributed nonzeros (not necessarily row-based distribution).
+//
+// Create global matrix nonzeros randomly; store all global nonzeros on 
+// each proc in a std::map.
+// Create distributed vectors with randomized entries using Trilinos' default
+// maps 
+// For each test (linear row-wise distribution, linear column-wise distribution,
+//                random distribution of nonzeros (2D) to processors)
+//    distribute matrix nonzeros (each proc selects subset of global nonzeros)
+//    create distributed CrsMatrix
+//    perform SpMV (nMatvec SpMVs)
+//    return result of SpMV
+// Compare norms of results of SpMV from all distributions; they should be the
+// same.
+//
+// NOTE:  timings are also reported but should be interpreted carefully.
+// This test does not attempt to optimize the distribution of the vectors to
+// minimize communication costs.  Moreover, 2D random distribution of nonzeros
+// can lead to high communication volume; a better 2D approach would be a 
+// block-based approach that better aligns vector entries with matrix entries.
+
+#include "Tpetra_Core.hpp"
+#include "Tpetra_Map.hpp"
+#include "Tpetra_CrsMatrix.hpp"
+#include "Tpetra_Vector.hpp"
+#include "Teuchos_TimeMonitor.hpp"
+
+// Class to generate, distribute and apply nonzeros 
+template <typename scalar_t, typename gno_t>
+class generatedNonzeros
+{
+public:
+  using map_t = Tpetra::Map<>;
+  using vector_t = Tpetra::Vector<scalar_t>;
+
+  // Randomly generate all nonzeros for the entire matrix on every processor
+  // Values are in the range 0.1 to 10.1;
+  generatedNonzeros(
+    size_t nRows_, size_t nCols_, size_t nnz,
+    const Teuchos::RCP<const Teuchos::Comm<int> > &comm_
+  ) :
+    nRows(nRows_), nCols(nCols_), comm(comm_)
+  {
+    // Synchronize RNG; want all processors to generate the same nonzeros
+    srand(1);
+
+    // use a map to remove duplicates and sort entries by row
+    for (size_t n = 0; n < nnz; n++) {
+      gno_t i = std::rand() % nRows;
+      gno_t j = std::rand() % nCols;
+      scalar_t val = 0.1 + scalar_t(std::rand() % 10);
+      nzmap[std::make_pair(i,j)] = val;
+    }
+
+    nNz = nzmap.size();
+  }
+
+  // Select nonzeros from nzmap that are to be assigned to this processor
+  // Create CRS data structures to ease matrix construction
+  void getMyNonzeros(
+    const int distribution,  // flag indicating how to distribute the nonzeros
+                             //   == 1 --> row-wise (Trilinos default)
+                             //   == 2 --> column-wise
+                             //   == 3 --> randomly assign nonzeros to procs
+    Teuchos::Array<gno_t> &rowIdx,    // output:  unique row indices of
+                                      // nonzeros on this proc (sorted
+                                      // in ascending order)
+    Teuchos::Array<size_t> &nPerRow,  // output:  nPerRow[i] == 
+                                      // number of nonzeros
+                                      // in rowIdx[i] on this proc
+    Teuchos::Array<size_t> &offsets,  // output:  CRS offset array;
+                                      // length = length(rowIdx) + 1
+    Teuchos::Array<gno_t> &colIdx,    // output:  CRS column indices
+    Teuchos::Array<scalar_t> &val     // output:  nonzerovalues
+  ) const
+  {
+    int np = comm->getSize();
+    int me = comm->getRank();
+
+    // Precompute values needed for distribution 1 (linear row-wise)
+    gno_t nMyRows = nRows / np + (nRows % np > me);
+    gno_t myFirstRow = (me * (nRows / np) + std::min<gno_t>(nRows % np, me));
+    gno_t myLastRow = myFirstRow + nMyRows - 1;
+
+    // Precompute values needed for distribution 2 (linear column-wise)
+    gno_t nMyCols = nCols / np + (nCols % np > me);
+    gno_t myFirstCol = (me * (nCols / np) + std::min<gno_t>(nCols % np, me));
+    gno_t myLastCol = myFirstCol + nMyCols - 1;
+
+    // Produce CRS formatted arrays of nonzeros assigned to this processor
+    // given a requested Distribution
+    gno_t prev_i = std::numeric_limits<gno_t>::max();
+
+    // Loop over global nonzeros; insert those assigned to this processor in
+    // CRS arrays.
+    // Exploit fact that nzmap entries are sorted by row i to build CRS arrays
+    for (auto nz = nzmap.begin(); nz != nzmap.end(); nz++) {
+
+      gno_t i = nz->first.first;
+      gno_t j = nz->first.second;
+      scalar_t v = nz->second;
+
+      // Check whether nonzero (i,j) should be stored on this processor
+      bool mine = false;
+      switch (distribution) {
+      case 1:  // linear row-wise
+        if (i >= myFirstRow && i <= myLastRow) mine = true;
+        break;
+      case 2:  // linear col-wise
+        if (j >= myFirstCol && j <= myLastCol) mine = true;
+        break;
+      case 3:  // random
+        int randomproc = std::rand() % np;
+        if (me == randomproc) mine = true;
+        break;
+      }
+
+      if (mine) {
+        // nzmap entries are sorted by i; add a new i when different from prev i
+        if (i != prev_i) {
+          rowIdx.push_back(i);
+          nPerRow.push_back(0);
+          prev_i = i;
+        }
+        colIdx.push_back(j);
+        val.push_back(v);
+        nPerRow.back()++;
+      }
+    }
+
+    // Compute prefix sum in offsets array
+    offsets.resize(rowIdx.size() + 1);
+    offsets[0] = 0;
+    for (size_t row = 0; row < rowIdx.size(); row++)
+      offsets[row+1] = offsets[row] + nPerRow[row];
+  }
+
+  // Distribute nonzeros to processors, create CrsMatrix, then apply it to
+  // input vector x, giving y
+  // Time the SpMV application
+  void distributeAndApply(
+    const int distribution,  // flag indicating how to distribute the nonzeros
+                             //   == 1 --> row-wise (Trilinos default)
+                             //   == 2 --> column-wise
+                             //   == 3 --> randomly assign nonzeros to procs
+    const int nMatvecs,      // Number of SpMV to do (for timing test)
+    const vector_t &xvec,    // input:  domain vector
+    vector_t &yvec           // output: range vector
+  ) const
+  {
+    // Select this processor's nonzeros based on distribution
+    Teuchos::Array<size_t> offsets;
+    Teuchos::Array<size_t> nPerRow;
+    Teuchos::Array<gno_t> rowIdx;
+    Teuchos::Array<gno_t> colIdx;
+    Teuchos::Array<scalar_t> val;
+
+    getMyNonzeros(distribution, rowIdx, nPerRow, offsets, colIdx, val);
+  
+    // Build the CrsMatrix with the assigned nonzeros
+    using matrix_t = Tpetra::CrsMatrix<scalar_t>;
+  
+    size_t dummy = Teuchos::OrdinalTraits<Tpetra::global_size_t>::invalid();
+    Teuchos::RCP<const map_t> rowMap =
+             Teuchos::rcp(new map_t(dummy, rowIdx(), 0, comm));
+
+    Teuchos::RCP<matrix_t> Amat = Teuchos::rcp(new matrix_t(rowMap, nPerRow()));
+  
+    for (size_t r = 0; r < rowIdx.size(); r++) {
+      size_t tmp = offsets[r+1] - offsets[r];
+      Amat->insertGlobalValues(rowIdx[r], 
+                               colIdx(offsets[r],tmp), val(offsets[r],tmp));
+    }
+  
+    std::string tname;
+    {
+      switch (distribution) {
+        case 1:  tname = "fillComplete: 1 row-wise"; break;
+        case 2:  tname = "fillComplete: 2 column-wise"; break;
+        case 3:  tname = "fillComplete: 3 random 2D"; break;
+      }
+      auto timer = Teuchos::TimeMonitor::getNewTimer(tname);
+
+      Teuchos::TimeMonitor tt(*timer);
+      Amat->fillComplete(xvec.getMap(), yvec.getMap());
+    }
+
+    std::cout << comm->getRank() 
+              << ": nRows " << Amat->getNodeNumRows()
+              << "; nCols " << Amat->getNodeNumCols()
+              << "; nnz " << Amat->getNodeNumEntries()
+              << "; import " 
+              << (Amat->getGraph()->getImporter() == Teuchos::null ? 0 :
+                  Amat->getGraph()->getImporter()->getNumExportIDs())
+              << "; export " 
+              << (Amat->getGraph()->getExporter() == Teuchos::null ? 0 : 
+                  Amat->getGraph()->getExporter()->getNumExportIDs())
+              << std::endl;
+  
+    // Perform SpMV; do several iterations to get some timing info
+    {
+      switch (distribution) {
+        case 1:  tname = "SpMV: 1 row-wise"; break;
+        case 2:  tname = "SpMV: 2 column-wise"; break;
+        case 3:  tname = "SpMV: 3 random 2D"; break;
+      }
+  
+      auto timer = Teuchos::TimeMonitor::getNewTimer(tname);
+      for (int n = 0; n < nMatvecs; n++) {
+        Teuchos::TimeMonitor tt(*timer);
+        Amat->apply(xvec, yvec);
+      }
+    }
+  }
+
+private:
+
+  using coord = std::pair<gno_t, gno_t>;
+  struct compareCoord {  // sort nonzeros by row, then column
+    bool operator() (const coord &lhs, const coord &rhs) const
+    { if (lhs.first < rhs.first) return true;
+      if ((lhs.first == rhs.first) && (lhs.second < rhs.second)) return true;
+      return false;
+    }
+  };
+  std::map<coord, scalar_t, compareCoord> nzmap;  // sorted global nonzeros
+  
+  size_t nRows, nCols, nNz;
+  const Teuchos::RCP<const Teuchos::Comm<int> > comm;
+
+};
+ 
+////////////////////////////////////////////////////////////////////////////
+
+int main(int narg, char *arg[]) 
+{
+  Tpetra::ScopeGuard scope(&narg, &arg);
+  Teuchos::RCP<const Teuchos::Comm<int> > comm = Tpetra::getDefaultComm();
+
+  using scalar_t = Tpetra::Details::DefaultTypes::scalar_type; 
+  using gno_t = Tpetra::Map<>::global_ordinal_type;
+
+  int me = comm->getRank();
+  int np = comm->getSize();
+
+  const int nMatvecs = 1000;
+  size_t nRows = np * 100;
+  size_t nCols = np * 200;
+  size_t nNz = np * 1000;
+
+  // Create random nonzeros -- all global nonzeros generated on every processor
+  generatedNonzeros<scalar_t, gno_t> gNz(nRows, nCols, nNz, comm);
+
+  // Create vectors; use Trilinos default range and domain maps
+  // These vectors do not optimize communication for the random 2D distribution
+  using map_t = Tpetra::Map<>;
+  using vector_t = Tpetra::Vector<scalar_t>;
+
+  Teuchos::RCP<const map_t> rangeMap = Teuchos::rcp(new map_t(nRows, 0, comm));
+  vector_t yvec(rangeMap);
+
+  Teuchos::RCP<const map_t> domainMap = Teuchos::rcp(new map_t(nCols, 0, comm));
+  vector_t xvec(domainMap);
+  xvec.randomize();
+
+  // Row-wise 1D distribution
+  gNz.distributeAndApply(1, nMatvecs, xvec, yvec);
+  scalar_t row1DNorm1 = yvec.norm2();
+  scalar_t row1DNorm2 = yvec.norm2();
+  scalar_t row1DNormInf = yvec.normInf();
+  if (me == 0) 
+    std::cout << "Row-wise 1D distribution:  norm1 " << row1DNorm1
+              << "; norm2 " << row1DNorm2
+              << "; norminf " << row1DNormInf
+              << std::endl;
+
+  // Column-wise 1D distribution
+  gNz.distributeAndApply(2, nMatvecs, xvec, yvec);
+  scalar_t col1DNorm1 = yvec.norm1();
+  scalar_t col1DNorm2 = yvec.norm2();
+  scalar_t col1DNormInf = yvec.normInf();
+  if (me == 0) 
+    std::cout << "Col-wise 1D distribution:  norm1 " << col1DNorm1
+              << "; norm2 " << col1DNorm2
+              << "; norminf " << col1DNormInf
+              << std::endl;
+
+  // Random 2D distribution
+  gNz.distributeAndApply(3, nMatvecs, xvec, yvec);
+  scalar_t random2DNorm1 = yvec.norm1();
+  scalar_t random2DNorm2 = yvec.norm2();
+  scalar_t random2DNormInf = yvec.normInf();
+  if (me == 0) 
+    std::cout << "Random 2D distribution:   norm1 " << random2DNorm1
+              << "; norm2 " << random2DNorm2
+              << "; norminf " << random2DNormInf
+              << std::endl;
+
+  // Check results
+  int ierr = 0;
+
+  const scalar_t epsilon = 0.0000001;
+  if (std::abs(col1DNorm2 - row1DNorm2) > epsilon) {
+    ierr++;
+    if (me == 0) 
+      std::cout << "FAIL:  column-wise 1D norm " << col1DNorm2
+                << " - " << row1DNorm2 << " row-wise 1D norm" 
+                << " = " << std::abs(col1DNorm2 - row1DNorm2) << std::endl;
+  }
+
+  if (std::abs(random2DNorm2 - row1DNorm2) > epsilon) {
+    ierr++;
+    if (me == 0) 
+      std::cout << "FAIL:  random 2D norm " << random2DNorm2
+                << " = " << row1DNorm2 << " row-wise 1D norm" 
+                << " = " << std::abs(random2DNorm2 - row1DNorm2) << std::endl;
+  }
+
+  if (ierr == 0 && me == 0) 
+    std::cout << "PASS" << std::endl;
+
+  Teuchos::TimeMonitor::summarize();
+  return ierr;
+}
+

From ded23463582941c8e78d9d4ebd98304aa9c878f1 Mon Sep 17 00:00:00 2001
From: "K. Devine" <kddevin@sandia.gov>
Date: Wed, 15 Apr 2020 13:41:23 -0600
Subject: [PATCH 2/8] Fixed typo

---
 packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp
index f966fd938d16..bddeabc16c1f 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp
@@ -307,7 +307,7 @@ int main(int narg, char *arg[])
 
   // Row-wise 1D distribution
   gNz.distributeAndApply(1, nMatvecs, xvec, yvec);
-  scalar_t row1DNorm1 = yvec.norm2();
+  scalar_t row1DNorm1 = yvec.norm1();
   scalar_t row1DNorm2 = yvec.norm2();
   scalar_t row1DNormInf = yvec.normInf();
   if (me == 0) 

From fd33141f344db38e877a07db4855b00068385b50 Mon Sep 17 00:00:00 2001
From: Eric Phipps <etphipp@sandia.gov>
Date: Fri, 15 May 2020 13:46:37 -0600
Subject: [PATCH 3/8] Sacado:  Fix performance bug with hierarchical-dfad
 enabled.

Direct writes between view's had a performance bug when hierarchical-dfad was
enabled where the correct thread striding wasn't being used.  This fixes that.
---
 packages/sacado/src/Sacado_Fad_ViewStorage.hpp     | 14 ++++++++++++--
 .../src/new_design/Sacado_Fad_Exp_ViewStorage.hpp  | 14 ++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/packages/sacado/src/Sacado_Fad_ViewStorage.hpp b/packages/sacado/src/Sacado_Fad_ViewStorage.hpp
index 1364b745d40e..3b41e3e7d98c 100644
--- a/packages/sacado/src/Sacado_Fad_ViewStorage.hpp
+++ b/packages/sacado/src/Sacado_Fad_ViewStorage.hpp
@@ -56,6 +56,14 @@ namespace Sacado {
 
   namespace Fad {
 
+#ifndef SACADO_FAD_DERIV_LOOP
+#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
+#define SACADO_FAD_DERIV_LOOP(I,SZ) for (int I=threadIdx.x; I<SZ; I+=blockDim.x)
+#else
+#define SACADO_FAD_DERIV_LOOP(I,SZ) for (int I=0; I<SZ; ++I)
+#endif
+#endif
+
     /*!
      * \brief Derivative array storage class that is a view into a contiguous
      * memory allocation.  It does not provide proper value semantics and
@@ -110,10 +118,12 @@ namespace Sacado {
         if (this != &x) {
           *val_ = *x.val_;
           if (stride_one)
-            for (int i=0; i<sz_.value; ++i)
+            //for (int i=0; i<sz_.value; ++i)
+            SACADO_FAD_DERIV_LOOP(i,sz_.value)
               dx_[i] = x.dx_[i];
           else
-            for (int i=0; i<sz_.value; ++i)
+            //for (int i=0; i<sz_.value; ++i)
+            SACADO_FAD_DERIV_LOOP(i,sz_.value)
               dx_[i*stride_.value] = x.dx_[i*x.stride_.value];
         }
         return *this;
diff --git a/packages/sacado/src/new_design/Sacado_Fad_Exp_ViewStorage.hpp b/packages/sacado/src/new_design/Sacado_Fad_Exp_ViewStorage.hpp
index 1f651ad2de23..6084c4d3b0c0 100644
--- a/packages/sacado/src/new_design/Sacado_Fad_Exp_ViewStorage.hpp
+++ b/packages/sacado/src/new_design/Sacado_Fad_Exp_ViewStorage.hpp
@@ -42,6 +42,14 @@ namespace Sacado {
   namespace Fad {
   namespace Exp {
 
+#ifndef SACADO_FAD_DERIV_LOOP
+#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD) && !defined(SACADO_DISABLE_CUDA_IN_KOKKOS) && defined(__CUDA_ARCH__)
+#define SACADO_FAD_DERIV_LOOP(I,SZ) for (int I=threadIdx.x; I<SZ; I+=blockDim.x)
+#else
+#define SACADO_FAD_DERIV_LOOP(I,SZ) for (int I=0; I<SZ; ++I)
+#endif
+#endif
+
     // Class representing a pointer to ViewFad so that &ViewFad is supported
     template <typename T, unsigned sl, unsigned ss, typename U>
     class ViewFadPtr;
@@ -118,10 +126,12 @@ namespace Sacado {
                 reinterpret_cast<const volatile char&>(x)))) {
           *val_ = *x.val_;
           if (stride_one)
-            for (int i=0; i<sz_.value; ++i)
+            //for (int i=0; i<sz_.value; ++i)
+            SACADO_FAD_DERIV_LOOP(i,sz_.value)
               dx_[i] = x.dx_[i];
           else
-            for (int i=0; i<sz_.value; ++i)
+            //for (int i=0; i<sz_.value; ++i)
+            SACADO_FAD_DERIV_LOOP(i,sz_.value)
               dx_[i*stride_.value] = x.dx_[i*x.stride_.value];
         }
         return *this;

From d00bec0e56b4d4b5c91a84425885291090f25a87 Mon Sep 17 00:00:00 2001
From: Eric Phipps <etphipp@sandia.gov>
Date: Fri, 15 May 2020 13:49:05 -0600
Subject: [PATCH 4/8] Sacado:  rework mat-vec and advection performance tests.

Tests were reorganized so that different kinds of hierarchical parallelism
can be used in the same test.  The old versions are still in the repo, but
are not compiled.  Once I determine the old ones are no longer needed, I'll
remove them in a later commit.
---
 .../sacado/test/performance/CMakeLists.txt    |  47 +-
 .../test/performance/advection/CMakeLists.txt |  22 +
 .../test/performance/advection/advection.cpp  | 424 +++++++++++
 .../test/performance/advection/advection.hpp  |  50 ++
 .../advection/advection_hierarchical.cpp      | 230 ++++++
 .../advection/advection_hierarchical.hpp      |  38 +
 .../advection/advection_hierarchical_dfad.cpp | 237 ++++++
 .../advection/advection_hierarchical_dfad.hpp |  39 +
 .../test/performance/advection/common.hpp     | 389 ++++++++++
 .../test/performance/advection/driver.cpp     | 164 +++++
 .../advection_const_basis/CMakeLists.txt      |  22 +
 .../advection_const_basis/advection.cpp       | 427 +++++++++++
 .../advection_const_basis/advection.hpp       |  50 ++
 .../advection_hierarchical.cpp                | 232 ++++++
 .../advection_hierarchical.hpp                |  38 +
 .../advection_hierarchical_dfad.cpp           | 239 ++++++
 .../advection_hierarchical_dfad.hpp           |  39 +
 .../advection_const_basis/common.hpp          | 350 +++++++++
 .../advection_const_basis/driver.cpp          | 164 +++++
 .../test/performance/mat_vec/CMakeLists.txt   |  22 +
 .../test/performance/mat_vec/common.hpp       |  45 ++
 .../test/performance/mat_vec/driver.cpp       | 386 ++++++++++
 .../mat_vec/fad_kokkos_mat_vec_perf.cpp       | 680 ++++++++++++++++++
 .../test/performance/mat_vec/mat_vec.cpp      | 552 ++++++++++++++
 .../test/performance/mat_vec/mat_vec.hpp      |  62 ++
 .../mat_vec/mat_vec_hierarchical.cpp          | 190 +++++
 .../mat_vec/mat_vec_hierarchical.hpp          |  37 +
 .../mat_vec/mat_vec_hierarchical_dfad.cpp     | 285 ++++++++
 .../mat_vec/mat_vec_hierarchical_dfad.hpp     |  43 ++
 29 files changed, 5482 insertions(+), 21 deletions(-)
 create mode 100644 packages/sacado/test/performance/advection/CMakeLists.txt
 create mode 100644 packages/sacado/test/performance/advection/advection.cpp
 create mode 100644 packages/sacado/test/performance/advection/advection.hpp
 create mode 100644 packages/sacado/test/performance/advection/advection_hierarchical.cpp
 create mode 100644 packages/sacado/test/performance/advection/advection_hierarchical.hpp
 create mode 100644 packages/sacado/test/performance/advection/advection_hierarchical_dfad.cpp
 create mode 100644 packages/sacado/test/performance/advection/advection_hierarchical_dfad.hpp
 create mode 100644 packages/sacado/test/performance/advection/common.hpp
 create mode 100644 packages/sacado/test/performance/advection/driver.cpp
 create mode 100644 packages/sacado/test/performance/advection_const_basis/CMakeLists.txt
 create mode 100644 packages/sacado/test/performance/advection_const_basis/advection.cpp
 create mode 100644 packages/sacado/test/performance/advection_const_basis/advection.hpp
 create mode 100644 packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp
 create mode 100644 packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp
 create mode 100644 packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.cpp
 create mode 100644 packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.hpp
 create mode 100644 packages/sacado/test/performance/advection_const_basis/common.hpp
 create mode 100644 packages/sacado/test/performance/advection_const_basis/driver.cpp
 create mode 100644 packages/sacado/test/performance/mat_vec/CMakeLists.txt
 create mode 100644 packages/sacado/test/performance/mat_vec/common.hpp
 create mode 100644 packages/sacado/test/performance/mat_vec/driver.cpp
 create mode 100644 packages/sacado/test/performance/mat_vec/fad_kokkos_mat_vec_perf.cpp
 create mode 100644 packages/sacado/test/performance/mat_vec/mat_vec.cpp
 create mode 100644 packages/sacado/test/performance/mat_vec/mat_vec.hpp
 create mode 100644 packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.cpp
 create mode 100644 packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.hpp
 create mode 100644 packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp
 create mode 100644 packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.hpp

diff --git a/packages/sacado/test/performance/CMakeLists.txt b/packages/sacado/test/performance/CMakeLists.txt
index 00f11c1f4832..f5611b27e290 100644
--- a/packages/sacado/test/performance/CMakeLists.txt
+++ b/packages/sacado/test/performance/CMakeLists.txt
@@ -76,29 +76,34 @@ ENDIF()
 
 IF (Sacado_ENABLE_TeuchosCore AND Sacado_ENABLE_KokkosCore)
 
-  TRIBITS_ADD_EXECUTABLE(
-    fad_kokkos_view
-    SOURCES fad_kokkos_view.cpp
-    COMM serial mpi
-    )
-
-  # These tests do not compile with gcc 4.7.x because it doesn't properly
-  # support lambdas.  See github issue #854
-  IF(NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8")))
-    TRIBITS_ADD_EXECUTABLE(
-      fad_kokkos_hierarchical
-      SOURCES fad_kokkos_hierarchical.cpp
-      COMM serial mpi
-      )
-
-#    TRIBITS_ADD_EXECUTABLE(
-#      fad_kokkos_mat_vec_perf
-#      SOURCES fad_kokkos_mat_vec_perf.cpp
-#      COMM serial mpi
-#      )
-  ENDIF()
+  # Disable these tests as they have been replaced by mat_vec, advection* below
+
+  # TRIBITS_ADD_EXECUTABLE(
+  #   fad_kokkos_view
+  #   SOURCES fad_kokkos_view.cpp
+  #   COMM serial mpi
+  #   )
+
+  # # These tests do not compile with gcc 4.7.x because it doesn't properly
+  # # support lambdas.  See github issue #854
+  # IF(NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8")))
+  #   TRIBITS_ADD_EXECUTABLE(
+  #     fad_kokkos_hierarchical
+  #     SOURCES fad_kokkos_hierarchical.cpp
+  #     COMM serial mpi
+  #     )
+
+  #  TRIBITS_ADD_EXECUTABLE(
+  #    fad_kokkos_mat_vec_perf
+  #    SOURCES fad_kokkos_mat_vec_perf.cpp
+  #    COMM serial mpi
+  #    )
+  # ENDIF()
 
 ENDIF()
 
 ADD_SUBDIRECTORY(fenl_assembly)
 ADD_SUBDIRECTORY(fenl_assembly_view)
+ADD_SUBDIRECTORY(mat_vec)
+ADD_SUBDIRECTORY(advection)
+ADD_SUBDIRECTORY(advection_const_basis)
diff --git a/packages/sacado/test/performance/advection/CMakeLists.txt b/packages/sacado/test/performance/advection/CMakeLists.txt
new file mode 100644
index 000000000000..51d0f4f56dfc
--- /dev/null
+++ b/packages/sacado/test/performance/advection/CMakeLists.txt
@@ -0,0 +1,22 @@
+ASSERT_DEFINED(PACKAGE_SOURCE_DIR CMAKE_CURRENT_SOURCE_DIR)
+
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+
+IF (Sacado_ENABLE_KokkosCore AND Sacado_ENABLE_TeuchosCore)
+
+  IF(NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8")))
+
+    TRIBITS_ADD_EXECUTABLE(
+      FadAdvectionHierarchicalTest
+      SOURCES common.hpp
+              advection.hpp advection.cpp
+              advection_hierarchical.hpp advection_hierarchical.cpp
+              advection_hierarchical_dfad.hpp advection_hierarchical_dfad.cpp
+              driver.cpp
+      COMM serial mpi
+      )
+
+  ENDIF()
+
+ENDIF()
diff --git a/packages/sacado/test/performance/advection/advection.cpp b/packages/sacado/test/performance/advection/advection.cpp
new file mode 100644
index 000000000000..8b4a803aacda
--- /dev/null
+++ b/packages/sacado/test/performance/advection/advection.cpp
@@ -0,0 +1,424 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#include "Sacado.hpp"
+#include "advection.hpp"
+#include "common.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_fad_flat(const FluxView& flux, const WgbView& wgb,
+                  const SrcView& src, const WbsView& wbs,
+                  const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
+                       KOKKOS_LAMBDA (const size_t cell)
+  {
+    scalar_type value, value2;
+    for (int basis=0; basis<num_basis; ++basis) {
+      value = 0.0;
+      value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value+value2;
+    }
+  });
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_fad_scratch(const FluxView& flux, const WgbView& wgb,
+                     const SrcView& src, const WbsView& wbs,
+                     const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+  typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const int vector_size  = 1;
+  const int team_size    = is_cuda_space<execution_space>::value ? 32 : 1;
+  const int fad_size     = Kokkos::dimension_scalar(residual);
+  const size_t range     = (num_cells+team_size-1)/team_size;
+  const size_t bytes     = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
+  policy_type policy(range,team_size,vector_size);
+
+  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
+                       KOKKOS_LAMBDA (const team_member& team)
+  {
+    const int team_rank = team.team_rank();
+    tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
+    tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
+    const size_t cell = team.league_rank()*team_size + team_rank;
+    if (cell < num_cells) {
+      for (int basis=0; basis<num_basis; ++basis) {
+        value(team_rank) = 0.0;
+        value2(team_rank) = 0.0;
+        for (int qp=0; qp<num_points; ++qp) {
+          for (int dim=0; dim<num_dim; ++dim)
+            value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+          value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
+        }
+        residual(cell,basis) = value(team_rank)+value2(team_rank);
+      }
+    }
+  });
+}
+
+template<int N, typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_analytic_flat(const FluxView& flux, const WgbView& wgb,
+                       const SrcView& src, const WbsView& wbs,
+                       const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
+                       KOKKOS_LAMBDA (const size_t cell)
+  {
+    scalar_type value[N+1],value2[N+1];
+    for (int basis=0; basis<num_basis; ++basis) {
+      for (int k=0; k<N+1; ++k) {
+        value[k] = 0.0;
+        value2[k] = 0.0;
+      }
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim) {
+          const scalar_type flux_val = flux(cell,qp,dim,N);
+          const scalar_type wgb_val = wgb(cell,basis,qp,dim,N);
+          value[N] += flux_val*wgb_val;
+          for(int k=0; k<N; k++)
+            value[k] +=
+              flux_val*wgb(cell,basis,qp,dim,k)+flux(cell,qp,dim,k)*wgb_val;
+        }
+        const scalar_type src_val = src(cell,qp,N);
+        const scalar_type wbs_val = wbs(cell,basis,qp,N);
+        value2[N] += src_val*wbs_val;
+        for(int k=0; k<N; k++)
+          value2[k] += src_val*wbs(cell,basis,qp,k)+src(cell,qp,k)*wbs_val;
+      }
+      for(int k=0; k<=N; k++)
+        residual(cell,basis,k) = value[k]+value2[k];
+    }
+  });
+}
+
+template<int N, typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_analytic_team(const FluxView& flux, const WgbView& wgb,
+                       const SrcView& src, const WbsView& wbs,
+                       const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+  typedef Kokkos::View<scalar_type[N+1], typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const size_t bytes     = 2*tmp_scratch_type::shmem_size();
+  policy_type policy(num_cells,num_basis,32);
+  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerThread(bytes)),
+                       KOKKOS_LAMBDA (const team_member& team)
+  {
+    tmp_scratch_type value(team.thread_scratch(0));
+    tmp_scratch_type value2(team.thread_scratch(0));
+    const size_t cell = team.league_rank();
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_basis),
+                         [&] (const int& basis)
+    {
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1),
+                           [&] (const int& k)
+      {
+        value(k) = 0;
+        value2(k) = 0;
+      });
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim) {
+          const scalar_type flux_val = flux(cell,qp,dim,N);
+          const scalar_type wgb_val = wgb(cell,basis,qp,dim,N);
+          Kokkos::single(Kokkos::PerThread(team), [&] () {
+            value[N] += flux_val*wgb_val;
+          });
+          Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N),
+                               [&] (const int& k)
+          {
+            value[k] +=
+              flux_val*wgb(cell,basis,qp,dim,k)+flux(cell,qp,dim,k)*wgb_val;
+          });
+        }
+        const scalar_type src_val = src(cell,qp,N);
+        const scalar_type wbs_val = wbs(cell,basis,qp,N);
+        Kokkos::single(Kokkos::PerThread(team), [&] () {
+          value2[N] += src_val*wbs_val;
+        });
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N),
+                             [&] (const int& k)
+        {
+          value2[k] += src_val*wbs(cell,basis,qp,k)+src(cell,qp,k)*wbs_val;
+        });
+      }
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1),
+                           [&] (const int& k)
+      {
+        residual(cell,basis,k) = value[k]+value2[k];
+      });
+    });
+  });
+}
+
+template <typename FadType, int N, typename ExecSpace>
+double time_fad_flat(int ncells, int num_basis, int num_points, int ndim,
+                     int ntrial, bool check)
+{
+  typedef Kokkos::View<FadType****,ExecSpace> t_4DView;
+  typedef Kokkos::View<FadType***,ExecSpace> t_3DView;
+  typedef Kokkos::View<FadType**,ExecSpace> t_2DView;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_3DView wbs("",ncells,num_basis,num_points,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_fad_flat(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_fad_flat(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <typename FadType, int N, typename ExecSpace>
+double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim,
+                        int ntrial, bool check)
+{
+  typedef Kokkos::View<FadType****,ExecSpace> t_4DView;
+  typedef Kokkos::View<FadType***,ExecSpace> t_3DView;
+  typedef Kokkos::View<FadType**,ExecSpace> t_2DView;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_3DView wbs("",ncells,num_basis,num_points,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_fad_scratch(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_fad_scratch(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim,
+                          int ntrial, bool check)
+{
+  typedef Kokkos::View<double****[N+1],ExecSpace> t_4DView;
+  typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
+  typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView flux("",ncells,num_points,ndim);
+  t_3DView wbs("",ncells,num_basis,num_points);
+  t_2DView src("",ncells,num_points);
+  t_2DView residual("",ncells,num_basis);
+  init_array(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_analytic_flat<N>(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_analytic_flat<N>(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_analytic_const(int ncells, int num_basis, int num_points, int ndim,
+                           int ntrial, bool check)
+{
+  typedef Kokkos::View<double****[N+1],ExecSpace> t_4DView;
+  typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
+  typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
+  typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_3DView_const;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView flux("",ncells,num_points,ndim);
+  t_3DView wbs("",ncells,num_basis,num_points);
+  t_2DView src("",ncells,num_points);
+  t_2DView residual("",ncells,num_basis);
+  init_array(wgb, wbs, flux, src, residual);
+
+  t_3DView_const flux_const = flux;
+
+  // Run once to warm up, complete any UVM transfers
+  run_analytic_flat<N>(flux_const, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_analytic_flat<N>(flux_const, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_analytic_team(int ncells, int num_basis, int num_points, int ndim,
+                          int ntrial, bool check)
+{
+  typedef Kokkos::View<double****[N+1],ExecSpace> t_4DView;
+  typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
+  typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
+  typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_3DView_const;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView flux("",ncells,num_points,ndim);
+  t_3DView wbs("",ncells,num_basis,num_points);
+  t_2DView src("",ncells,num_points);
+  t_2DView residual("",ncells,num_basis);
+  init_array(wgb, wbs, flux, src, residual);
+
+  t_3DView_const flux_const = flux;
+
+  // Run once to warm up, complete any UVM transfers
+  run_analytic_team<N>(flux_const, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_analytic_team<N>(flux_const, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+#define INST_FUNC_FAD_N_DEV(FAD,N,DEV) \
+  template double time_fad_flat< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_fad_scratch< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
+
+#define INST_FUNC_N_DEV(N,DEV) \
+  INST_FUNC_FAD_N_DEV(SFadType,N,DEV) \
+  INST_FUNC_FAD_N_DEV(DFadType,N,DEV) \
+  template double time_analytic_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_analytic_const< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_analytic_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
+
+#define INST_FUNC_DEV(DEV) \
+  INST_FUNC_N_DEV( fad_dim, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/advection/advection.hpp b/packages/sacado/test/performance/advection/advection.hpp
new file mode 100644
index 000000000000..adca588cee11
--- /dev/null
+++ b/packages/sacado/test/performance/advection/advection.hpp
@@ -0,0 +1,50 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+template <typename FadType, int N, typename ExecSpace>
+double time_fad_flat(int ncells, int num_basis, int num_points, int ndim,
+                     int ntrial, bool check);
+
+template <typename FadType, int N, typename ExecSpace>
+double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim,
+                        int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim,
+                          int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_analytic_const(int ncells, int num_basis, int num_points, int ndim,
+                           int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_analytic_team(int ncells, int num_basis, int num_points, int ndim,
+                          int ntrial, bool check);
diff --git a/packages/sacado/test/performance/advection/advection_hierarchical.cpp b/packages/sacado/test/performance/advection/advection_hierarchical.cpp
new file mode 100644
index 000000000000..f6377c207dbd
--- /dev/null
+++ b/packages/sacado/test/performance/advection/advection_hierarchical.cpp
@@ -0,0 +1,230 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#define SACADO_VIEW_CUDA_HIERARCHICAL 1
+#define SACADO_ALIGN_SFAD 1
+
+#include "Sacado.hpp"
+#include "advection_hierarchical.hpp"
+#include "common.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_fad_hierarchical_flat(const FluxView& flux, const WgbView& wgb,
+                               const SrcView& src, const WbsView& wbs,
+                               const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const bool is_cuda     = is_cuda_space<execution_space>::value;
+  const int vector_size  = is_cuda ? 32 : 1;
+  const int team_size    = is_cuda ? 256/vector_size : 1;
+  const size_t range     = (num_cells+team_size-1)/team_size;
+
+  policy_type policy(range,team_size,vector_size);
+  Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
+  {
+    const size_t cell = team.league_rank()*team_size + team.team_rank();
+    local_scalar_type value, value2;
+    for (int basis=0; basis<num_basis; ++basis) {
+      value = 0.0;
+      value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value+value2;
+    }
+  });
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_fad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
+                               const SrcView& src, const WbsView& wbs,
+                               const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const bool is_cuda     = is_cuda_space<execution_space>::value;
+  const int vector_size  = is_cuda ? 32 : 1;
+  const int team_size    = is_cuda ? 256/vector_size : 1;
+
+  policy_type policy(num_cells,team_size,vector_size);
+  Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
+  {
+    const int team_rank = team.team_rank();
+    const size_t cell = team.league_rank();
+    local_scalar_type value, value2;
+    for (int basis=team_rank; basis<num_basis; basis+=team_size) {
+      value = 0.0;
+      value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value+value2;
+    }
+  });
+}
+
+template <int N, typename ExecSpace>
+double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points,
+                                  int ndim, int ntrial, bool check)
+{
+  typedef Sacado::Fad::SFad<double,N> FadType;
+
+  static const int FadStride = is_cuda_space<ExecSpace>::value ? 32 : 1;
+#if defined(SACADO_ALIGN_SFAD)
+  static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
+  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
+#else
+  typedef FadType AlignedFadType;
+#endif
+
+  typedef typename ExecSpace::array_layout DefaultLayout;
+  typedef Kokkos::LayoutContiguous<DefaultLayout,FadStride> ContLayout;
+  typedef Kokkos::View<AlignedFadType****,ContLayout,ExecSpace> t_4DView;
+  typedef Kokkos::View<AlignedFadType***,ContLayout,ExecSpace> t_3DView;
+  typedef Kokkos::View<AlignedFadType**,ContLayout,ExecSpace> t_2DView;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_3DView wbs("",ncells,num_basis,num_points,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_fad_hierarchical_flat(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_fad_hierarchical_flat(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_fad_hierarchical_team(int ncells, int num_basis, int num_points,
+                                  int ndim, int ntrial, bool check)
+{
+  typedef Sacado::Fad::SFad<double,N> FadType;
+
+  static const int FadStride = is_cuda_space<ExecSpace>::value ? 32 : 1;
+#if defined(SACADO_ALIGN_SFAD)
+  static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
+  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
+#else
+  typedef FadType AlignedFadType;
+#endif
+
+  typedef typename ExecSpace::array_layout DefaultLayout;
+  typedef Kokkos::LayoutContiguous<DefaultLayout,FadStride> ContLayout;
+  typedef Kokkos::View<AlignedFadType****,ContLayout,ExecSpace> t_4DView;
+  typedef Kokkos::View<AlignedFadType***,ContLayout,ExecSpace> t_3DView;
+  typedef Kokkos::View<AlignedFadType**,ContLayout,ExecSpace> t_2DView;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_3DView wbs("",ncells,num_basis,num_points,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_fad_hierarchical_team(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_fad_hierarchical_team(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+#define INST_FUNC_N_DEV(N,DEV) \
+  template double time_fad_hierarchical_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_fad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
+
+#define INST_FUNC_DEV(DEV) \
+  INST_FUNC_N_DEV( fad_dim, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/advection/advection_hierarchical.hpp b/packages/sacado/test/performance/advection/advection_hierarchical.hpp
new file mode 100644
index 000000000000..f78f4316897b
--- /dev/null
+++ b/packages/sacado/test/performance/advection/advection_hierarchical.hpp
@@ -0,0 +1,38 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+template <int N, typename ExecSpace>
+double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points,
+                                  int ndim, int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_fad_hierarchical_team(int ncells, int num_basis, int num_points,
+                                  int ndim, int ntrial, bool check);
diff --git a/packages/sacado/test/performance/advection/advection_hierarchical_dfad.cpp b/packages/sacado/test/performance/advection/advection_hierarchical_dfad.cpp
new file mode 100644
index 000000000000..8e93b4a74256
--- /dev/null
+++ b/packages/sacado/test/performance/advection/advection_hierarchical_dfad.cpp
@@ -0,0 +1,237 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
+#define SACADO_KOKKOS_USE_MEMORY_POOL 1
+
+#include "Sacado.hpp"
+#include "advection_hierarchical_dfad.hpp"
+#include "common.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_dfad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
+                                const SrcView& src, const WbsView& wbs,
+                                const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const bool is_cuda     = is_cuda_space<execution_space>::value;
+  const int vector_size  = is_cuda ? 32 : 1;
+  const int team_size    = is_cuda ? 256/vector_size : 1;
+
+  policy_type policy(num_cells,team_size,vector_size);
+  Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
+  {
+    const int team_rank = team.team_rank();
+    const size_t cell = team.league_rank();
+    scalar_type value, value2;
+    for (int basis=team_rank; basis<num_basis; basis+=team_size) {
+      value = 0.0;
+      value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value+value2;
+    }
+  });
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_dfad_hierarchical_team_scratch(
+  const FluxView& flux, const WgbView& wgb,
+  const SrcView& src, const WbsView& wbs,
+  const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+  typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const bool is_cuda     = is_cuda_space<execution_space>::value;
+  const int vector_size  = is_cuda ? 32 : 1;
+  const int team_size    = is_cuda ? 256/vector_size : 1;
+  const int fad_size     = Kokkos::dimension_scalar(residual);
+  const size_t bytes     = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
+  policy_type policy(num_cells,team_size,vector_size);
+
+  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
+                       KOKKOS_LAMBDA (const team_member& team)
+  {
+    const int team_rank = team.team_rank();
+    const size_t cell = team.league_rank();
+    tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
+    tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
+    for (int basis=team_rank; basis<num_basis; basis+=team_size) {
+      value(team_rank) = 0.0;
+      value2(team_rank) = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value(team_rank)+value2(team_rank);
+    }
+  });
+}
+
+template <int N, typename ExecSpace>
+double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points,
+                                   int ndim, int ntrial, bool check)
+{
+  typedef Sacado::Fad::DFad<double> FadType;
+
+  typedef typename ExecSpace::array_layout DefaultLayout;
+  typedef Kokkos::LayoutContiguous<DefaultLayout> ContLayout;
+  typedef Kokkos::View<FadType****,ContLayout,ExecSpace> t_4DView;
+  typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
+  typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_3DView wbs("",ncells,num_basis,num_points,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Create memory pool for DFad
+  // The kernel allocates 2*N double's per warp on Cuda.  Approximate
+  // the maximum number of warps as the maximum concurrency / 32.
+  // Include a fudge factor of 1.2 since memory pool treats a block as full
+  // once it reaches 80% capacity
+  const size_t block_size = N*sizeof(double);
+  size_t nkernels = ExecSpace::concurrency()*2;
+  if (is_cuda_space<ExecSpace>::value)
+    nkernels /= 32;
+  const size_t mem_pool_size = static_cast<size_t>(1.2*nkernels*block_size);
+  const size_t superblock_size =
+    std::max<size_t>(nkernels / 100, 1) * block_size;
+  ExecSpace exec_space;
+  Sacado::createGlobalMemoryPool(exec_space, mem_pool_size,
+                                 block_size, block_size, superblock_size);
+
+  // Run once to warm up, complete any UVM transfers
+  run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  // Destroy memory pool
+  Sacado::destroyGlobalMemoryPool(exec_space);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_dfad_hierarchical_team_scratch(
+  int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
+{
+  typedef Sacado::Fad::DFad<double> FadType;
+
+  typedef typename ExecSpace::array_layout DefaultLayout;
+  typedef Kokkos::LayoutContiguous<DefaultLayout> ContLayout;
+  typedef Kokkos::View<FadType****,ContLayout,ExecSpace> t_4DView;
+  typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
+  typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
+
+  t_4DView wgb("",ncells,num_basis,num_points,ndim,N+1);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_3DView wbs("",ncells,num_basis,num_points,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+#define INST_FUNC_N_DEV(N,DEV) \
+  template double time_dfad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
+
+#define INST_FUNC_DEV(DEV) \
+  INST_FUNC_N_DEV( fad_dim, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/advection/advection_hierarchical_dfad.hpp b/packages/sacado/test/performance/advection/advection_hierarchical_dfad.hpp
new file mode 100644
index 000000000000..9933d1102f73
--- /dev/null
+++ b/packages/sacado/test/performance/advection/advection_hierarchical_dfad.hpp
@@ -0,0 +1,39 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+template <int N, typename ExecSpace>
+double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points,
+                                   int ndim, int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_dfad_hierarchical_team_scratch(int ncells, int num_basis,
+                                           int num_points, int ndim, int ntrial,
+                                           bool check);
diff --git a/packages/sacado/test/performance/advection/common.hpp b/packages/sacado/test/performance/advection/common.hpp
new file mode 100644
index 000000000000..3a8c76257050
--- /dev/null
+++ b/packages/sacado/test/performance/advection/common.hpp
@@ -0,0 +1,389 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+const int fad_dim = 50;
+typedef Sacado::Fad::SFad<double,fad_dim> SFadType;
+typedef Sacado::Fad::DFad<double> DFadType;
+
+template <typename ExecSpace>
+struct is_cuda_space {
+  static const bool value = false;
+};
+
+#ifdef KOKKOS_ENABLE_CUDA
+template <>
+struct is_cuda_space<Kokkos::Cuda> {
+  static const bool value = true;
+};
+#endif
+
+template <typename scalar>
+scalar
+generate_fad(const size_t n0, const size_t n1,
+             const size_t n2, const size_t n3, const int fad_size,
+             const size_t i0, const size_t i1,
+             const size_t i2, const size_t i3,
+             const int i_fad)
+{
+  const scalar x0 =    10.0 + scalar(n0) / scalar(i0+1);
+  const scalar x1 =   100.0 + scalar(n1) / scalar(i1+1);
+  const scalar x2 =  1000.0 + scalar(n2) / scalar(i2+1);
+  const scalar x3 = 10000.0 + scalar(n3) / scalar(i3+1);
+  const scalar x  = x0 + x1 + x2 + x3;
+  if (i_fad == fad_size)
+    return x;
+  const scalar x_fad = 1.0 + scalar(fad_size) / scalar(i_fad+1);
+  return x + x_fad;
+}
+
+template <typename V1, typename V2, typename V3, typename V4, typename V5>
+void init_fad(const V1& v1, const V2& v2, const V3& v3, const V4& v4,
+              const V5& v5)
+{
+  typedef typename V1::non_const_value_type::value_type scalar;
+
+  const int ncells     = v1.extent(0);
+  const int num_basis  = v1.extent(1);
+  const int num_points = v1.extent(2);
+  const int ndim       = v1.extent(3);
+  const int N          = Kokkos::dimension_scalar(v1)-1;
+
+  // Kokkos::deep_copy(typename V1::array_type(v1), 1.0);
+  // Kokkos::deep_copy(typename V2::array_type(v2), 2.0);
+  // Kokkos::deep_copy(typename V3::array_type(v3), 3.0);
+  // Kokkos::deep_copy(typename V4::array_type(v4), 4.0);
+
+  auto v1_h = Kokkos::create_mirror_view(v1);
+  auto v2_h = Kokkos::create_mirror_view(v2);
+  auto v3_h = Kokkos::create_mirror_view(v3);
+  auto v4_h = Kokkos::create_mirror_view(v4);
+  for (int cell=0; cell<ncells; ++cell) {
+    for (int basis=0; basis<num_basis; ++basis) {
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<ndim; ++dim) {
+          for (int i=0; i<N; ++i)
+            v1_h(cell,basis,qp,dim).fastAccessDx(i) =
+              generate_fad<scalar>(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i);
+          v1_h(cell,basis,qp,dim).val() =
+            generate_fad<scalar>(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
+        }
+        for (int i=0; i<N; ++i)
+          v2_h(cell,basis,qp).fastAccessDx(i) =
+            generate_fad<scalar>(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i);
+        v2_h(cell,basis,qp).val() =
+          generate_fad<scalar>(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
+      }
+    }
+    for (int qp=0; qp<num_points; ++qp) {
+      for (int dim=0; dim<ndim; ++dim) {
+        for (int i=0; i<N; ++i)
+          v3_h(cell,qp,dim).fastAccessDx(i) =
+            generate_fad<scalar>(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
+        v3_h(cell,qp,dim).val() =
+          generate_fad<scalar>(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
+      }
+        for (int i=0; i<N; ++i)
+          v4_h(cell,qp).fastAccessDx(i) =
+            generate_fad<scalar>(ncells,1,num_points,1,N,cell,0,qp,0,i);
+        v4_h(cell,qp).val() =
+          generate_fad<scalar>(ncells,1,num_points,1,N,cell,0,qp,0,N);
+    }
+  }
+
+  Kokkos::deep_copy( v1, v1_h );
+  Kokkos::deep_copy( v2, v2_h );
+  Kokkos::deep_copy( v3, v3_h );
+  Kokkos::deep_copy( v4, v4_h );
+
+  Kokkos::deep_copy(typename V5::array_type(v5), 0.0);
+}
+
+template <typename V1, typename V2, typename V3, typename V4, typename V5>
+void init_array(const V1& v1, const V2& v2, const V3& v3, const V4& v4,
+                const V5& v5)
+{
+  typedef typename V1::non_const_value_type scalar;
+
+  const int ncells     = v1.extent(0);
+  const int num_basis  = v1.extent(1);
+  const int num_points = v1.extent(2);
+  const int ndim       = v1.extent(3);
+  const int N          = v1.extent(4)-1;
+
+  // Kokkos::deep_copy(typename V1::array_type(v1), 1.0);
+  // Kokkos::deep_copy(typename V2::array_type(v2), 2.0);
+  // Kokkos::deep_copy(typename V3::array_type(v3), 3.0);
+  // Kokkos::deep_copy(typename V4::array_type(v4), 4.0);
+
+  auto v1_h = Kokkos::create_mirror_view(v1);
+  auto v2_h = Kokkos::create_mirror_view(v2);
+  auto v3_h = Kokkos::create_mirror_view(v3);
+  auto v4_h = Kokkos::create_mirror_view(v4);
+  for (int cell=0; cell<ncells; ++cell) {
+    for (int basis=0; basis<num_basis; ++basis) {
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<ndim; ++dim) {
+          for (int i=0; i<N; ++i)
+            v1_h(cell,basis,qp,dim,i) =
+              generate_fad<scalar>(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,i);
+          v1_h(cell,basis,qp,dim,N) =
+            generate_fad<scalar>(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
+        }
+        for (int i=0; i<N; ++i)
+          v2_h(cell,basis,qp,i) =
+            generate_fad<scalar>(ncells,num_basis,num_points,1,N,cell,basis,qp,0,i);
+        v2_h(cell,basis,qp,N) =
+          generate_fad<scalar>(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
+      }
+    }
+    for (int qp=0; qp<num_points; ++qp) {
+      for (int dim=0; dim<ndim; ++dim) {
+        for (int i=0; i<N; ++i)
+          v3_h(cell,qp,dim,i) =
+            generate_fad<scalar>(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
+        v3_h(cell,qp,dim,N) =
+          generate_fad<scalar>(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
+      }
+      for (int i=0; i<N; ++i)
+        v4_h(cell,qp,i) =
+          generate_fad<scalar>(ncells,1,num_points,1,N,cell,0,qp,0,i);
+      v4_h(cell,qp,N) =
+        generate_fad<scalar>(ncells,1,num_points,1,N,cell,0,qp,0,N);
+    }
+  }
+
+  Kokkos::deep_copy( v1, v1_h );
+  Kokkos::deep_copy( v2, v2_h );
+  Kokkos::deep_copy( v3, v3_h );
+  Kokkos::deep_copy( v4, v4_h );
+
+  Kokkos::deep_copy(typename V5::array_type(v5), 0.0);
+}
+
+template <typename View1, typename View2>
+typename std::enable_if< !Kokkos::is_view_fad<View2>::value, bool>::type
+check(const View1& v_gold, const View2& v, const double tol)
+{
+  // Copy to host
+  typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
+  typename View2::HostMirror v_h      = Kokkos::create_mirror_view(v);
+  Kokkos::deep_copy(v_gold_h, v_gold);
+  Kokkos::deep_copy(v_h, v);
+
+  typedef typename View1::value_type value_type;
+
+  const size_t n0 = v_gold_h.extent(0);
+  const size_t n1 = v_gold_h.extent(1);
+  const size_t n2 = v_gold_h.extent(2);
+
+  bool success = true;
+  for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+        value_type x_gold = v_gold_h(i0,i1,i2);
+        value_type x      = v_h(i0,i1,i2);
+        if (std::abs(x_gold-x) > tol*std::abs(x_gold)) {
+          std::cout << "Comparison failed!  x_gold("
+                    << i0 << "," << i1 << "," << i2 << ") = "
+                    << x_gold << " , x = " << x
+                    << std::endl;
+          success = false;
+        }
+      }
+    }
+  }
+
+  return success;
+}
+
+template <typename View1, typename View2>
+typename std::enable_if< Kokkos::is_view_fad<View2>::value, bool>::type
+check(const View1& v_gold, const View2& v, const double tol)
+{
+  // Copy to host
+  typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
+  typename View2::HostMirror v_h      = Kokkos::create_mirror_view(v);
+  Kokkos::deep_copy(v_gold_h, v_gold);
+  Kokkos::deep_copy(v_h, v);
+
+  typedef typename View1::value_type value_type;
+
+  const size_t n0 = v_gold_h.extent(0);
+  const size_t n1 = v_gold_h.extent(1);
+  const size_t n2 = v_gold_h.extent(2);
+
+  bool success = true;
+  for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+        value_type x_gold = v_gold_h(i0,i1,i2);
+        value_type x      = (i2 == n2-1) ? v_h(i0,i1).val() : v_h(i0,i1).dx(i2);
+        if (std::abs(x_gold-x) > tol*std::abs(x_gold)) {
+          std::cout << "Comparison failed!  x_gold("
+                    << i0 << "," << i1 << "," << i2 << ") = "
+                    << x_gold << " , x = " << x
+                    << std::endl;
+          success = false;
+        }
+      }
+    }
+  }
+
+  return success;
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView>
+Kokkos::View<double***,typename FluxView::execution_space>
+compute_gold_residual(
+  const FluxView& flux, const WgbView& wgb, const SrcView& src,
+  const WbsView& wbs,
+  typename std::enable_if< Kokkos::is_view_fad<FluxView>::value>::type* = 0)
+{
+  typedef typename FluxView::execution_space execution_space;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+  const int N            = Kokkos::dimension_scalar(wgb)-1;
+
+  Kokkos::View<double***,typename FluxView::execution_space> residual(
+    "",num_cells,num_basis,N+1);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
+                       KOKKOS_LAMBDA (const size_t cell)
+  {
+    double value, value2;
+
+    // Value
+    for (int basis=0; basis<num_basis; ++basis) {
+      value = value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim).val()*wgb(cell,basis,qp,dim).val();
+        value2 += src(cell,qp).val()*wbs(cell,basis,qp).val();
+      }
+      residual(cell,basis,N) = value+value2;
+    }
+
+    // Derivatives
+    for (int k=0; k<N; ++k) {
+      for (int basis=0; basis<num_basis; ++basis) {
+        value = value2 = 0.0;
+        for (int qp=0; qp<num_points; ++qp) {
+          for (int dim=0; dim<num_dim; ++dim)
+            value +=
+              flux(cell,qp,dim).val()*wgb(cell,basis,qp,dim).dx(k) +
+              flux(cell,qp,dim).dx(k)*wgb(cell,basis,qp,dim).val();
+          value2 +=
+            src(cell,qp).val()*wbs(cell,basis,qp).dx(k) +
+            src(cell,qp).dx(k)*wbs(cell,basis,qp).val();
+        }
+        residual(cell,basis,k) = value+value2;
+      }
+    }
+  });
+
+  return residual;
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView>
+Kokkos::View<double***,typename FluxView::execution_space>
+compute_gold_residual(
+  const FluxView& flux, const WgbView& wgb, const SrcView& src,
+  const WbsView& wbs,
+  typename std::enable_if< !Kokkos::is_view_fad<FluxView>::value>::type* = 0)
+{
+  typedef typename FluxView::execution_space execution_space;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+  const int N            = wgb.extent(4)-1;
+
+  Kokkos::View<double***,typename FluxView::execution_space> residual(
+    "",num_cells,num_basis,N+1);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
+                       KOKKOS_LAMBDA (const size_t cell)
+  {
+    double value, value2;
+
+    // Value
+    for (int basis=0; basis<num_basis; ++basis) {
+      value = value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim,N)*wgb(cell,basis,qp,dim,N);
+        value2 += src(cell,qp,N)*wbs(cell,basis,qp,N);
+      }
+      residual(cell,basis,N) = value+value2;
+    }
+
+    // Derivatives
+    for (int k=0; k<N; ++k) {
+      for (int basis=0; basis<num_basis; ++basis) {
+        value = value2 = 0.0;
+        for (int qp=0; qp<num_points; ++qp) {
+          for (int dim=0; dim<num_dim; ++dim)
+            value +=
+              flux(cell,qp,dim,N)*wgb(cell,basis,qp,dim,k) +
+              flux(cell,qp,dim,k)*wgb(cell,basis,qp,dim,N);
+          value2 +=
+            src(cell,qp,N)*wbs(cell,basis,qp,k) +
+            src(cell,qp,k)*wbs(cell,basis,qp,N);
+        }
+        residual(cell,basis,k) = value+value2;
+      }
+    }
+  });
+
+  return residual;
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void check_residual(const FluxView& flux, const WgbView& wgb,
+                    const SrcView& src, const WbsView& wbs,
+                    const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+
+  // Generate gold residual
+  auto residual_gold = compute_gold_residual(flux, wgb, src, wbs);
+
+  // Compare residual and residual_gold
+  const double tol = 1.0e-14;
+  check(residual_gold, residual, tol);
+}
diff --git a/packages/sacado/test/performance/advection/driver.cpp b/packages/sacado/test/performance/advection/driver.cpp
new file mode 100644
index 000000000000..1812cc80ed0a
--- /dev/null
+++ b/packages/sacado/test/performance/advection/driver.cpp
@@ -0,0 +1,164 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+// A performance test that computes the derivative of a simple Kokkos kernel
+// using various Fad classes
+
+#include "Sacado.hpp"
+
+#include "advection.hpp"
+#include "advection_hierarchical.hpp"
+#include "advection_hierarchical_dfad.hpp"
+#include "common.hpp"
+
+#include "Teuchos_CommandLineProcessor.hpp"
+#include "Teuchos_StandardCatchMacros.hpp"
+
+template <typename ExecSpace>
+void run(const int cell_begin, const int cell_end, const int cell_step,
+         const int nbasis, const int npoint, const int ntrial, const bool check)
+{
+  const int ndim = 3;
+  printf("ncell %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "flat sfad", "flat dfad", "dfad sc", "analytic", "const", "team", "hier sfad", "hier dfad", "h dfad sc");
+  for(int i=cell_begin; i<=cell_end; i+=cell_step) {
+    double sfad_flat = time_fad_flat<SFadType,fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double dfad_flat = time_fad_flat<DFadType,fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double dfad_scratch = time_fad_scratch<DFadType,fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double analytic = time_analytic_flat<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double analytic_const = time_analytic_const<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double analytic_team = time_analytic_team<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double sfad_hierarchical = time_fad_hierarchical_team<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double dfad_hierarchical = time_dfad_hierarchical_team<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double dfad_hierarchical_scratch =
+      time_dfad_hierarchical_team_scratch<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    printf("%5d %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e\n",i,sfad_flat,dfad_flat,dfad_scratch,analytic,analytic_const,analytic_team,sfad_hierarchical,dfad_hierarchical,dfad_hierarchical_scratch);
+  }
+}
+
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc,argv);
+
+  bool success = true;
+  try {
+
+    // Set up command line options
+    Teuchos::CommandLineProcessor clp(false);
+    clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
+#ifdef KOKKOS_ENABLE_SERIAL
+    bool serial = 0;
+    clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+    bool openmp = 0;
+    clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP");
+#endif
+#ifdef KOKKOS_ENABLE_THREADS
+    bool threads = 0;
+    clp.setOption("threads", "no-threads", &threads, "Whether to run Threads");
+#endif
+#ifdef KOKKOS_ENABLE_CUDA
+    bool cuda = 0;
+    clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
+#endif
+    bool print_config = false;
+    clp.setOption("print-config", "no-print-config", &print_config,
+                  "Whether to print Kokkos device configuration");
+    int cell_begin = 100;
+    clp.setOption("begin", &cell_begin, "Starting number of cells");
+    int cell_end = 8000;
+    clp.setOption("end", &cell_end, "Ending number of cells");
+    int cell_step = 100;
+    clp.setOption("step", &cell_step, "Cell increment");
+    int nbasis = 8;
+    clp.setOption("basis", &nbasis, "Number of basis functions");
+    int npoint = 8;
+    clp.setOption("point", &npoint, "Number of integration points");
+    int ntrial = 5;
+    clp.setOption("trial", &ntrial, "Number of trials");
+    bool check = false;
+    clp.setOption("check", "no-check", &check,
+                  "Check correctness of results");
+
+    // Parse options
+    switch (clp.parse(argc, argv)) {
+    case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:
+      return 0;
+    case Teuchos::CommandLineProcessor::PARSE_ERROR:
+    case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION:
+      return 1;
+    case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:
+      break;
+    }
+
+    if (print_config)
+      Kokkos::print_configuration(std::cout, true);
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if (serial) {
+      using Kokkos::Serial;
+      run<Serial>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+    if (openmp) {
+      using Kokkos::OpenMP;
+      run<OpenMP>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+    if (threads) {
+      using Kokkos::Threads;
+      run<Threads>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+    if (cuda) {
+      using Kokkos::Cuda;
+      run<Cuda>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
+    }
+#endif
+  }
+  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
+
+  Kokkos::finalize();
+
+  return !success;
+}
diff --git a/packages/sacado/test/performance/advection_const_basis/CMakeLists.txt b/packages/sacado/test/performance/advection_const_basis/CMakeLists.txt
new file mode 100644
index 000000000000..d2bd03b5b92f
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/CMakeLists.txt
@@ -0,0 +1,22 @@
+ASSERT_DEFINED(PACKAGE_SOURCE_DIR CMAKE_CURRENT_SOURCE_DIR)
+
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+
+IF (Sacado_ENABLE_KokkosCore AND Sacado_ENABLE_TeuchosCore)
+
+  IF(NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8")))
+
+    TRIBITS_ADD_EXECUTABLE(
+      FadAdvectionConstBasisHierarchicalTest
+      SOURCES common.hpp
+              advection.hpp advection.cpp
+              advection_hierarchical.hpp advection_hierarchical.cpp
+              advection_hierarchical_dfad.hpp advection_hierarchical_dfad.cpp
+              driver.cpp
+      COMM serial mpi
+      )
+
+  ENDIF()
+
+ENDIF()
diff --git a/packages/sacado/test/performance/advection_const_basis/advection.cpp b/packages/sacado/test/performance/advection_const_basis/advection.cpp
new file mode 100644
index 000000000000..cdf84c285ca3
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/advection.cpp
@@ -0,0 +1,427 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#include "Sacado.hpp"
+#include "advection.hpp"
+#include "common.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_fad_flat(const FluxView& flux, const WgbView& wgb,
+                  const SrcView& src, const WbsView& wbs,
+                  const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
+                       KOKKOS_LAMBDA (const size_t cell)
+  {
+    scalar_type value, value2;
+    for (int basis=0; basis<num_basis; ++basis) {
+      value = 0.0;
+      value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value+value2;
+    }
+  });
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_fad_scratch(const FluxView& flux, const WgbView& wgb,
+                     const SrcView& src, const WbsView& wbs,
+                     const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+  typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const int vector_size  = 1;
+  const int team_size    = is_cuda_space<execution_space>::value ? 32 : 1;
+  const int fad_size     = Kokkos::dimension_scalar(residual);
+  const size_t range     = (num_cells+team_size-1)/team_size;
+  const size_t bytes     = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
+  policy_type policy(range,team_size,vector_size);
+
+  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
+                       KOKKOS_LAMBDA (const team_member& team)
+  {
+    const int team_rank = team.team_rank();
+    tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
+    tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
+    const size_t cell = team.league_rank()*team_size + team_rank;
+    if (cell < num_cells) {
+      for (int basis=0; basis<num_basis; ++basis) {
+        value(team_rank) = 0.0;
+        value2(team_rank) = 0.0;
+        for (int qp=0; qp<num_points; ++qp) {
+          for (int dim=0; dim<num_dim; ++dim)
+            value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+          value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
+        }
+        residual(cell,basis) = value(team_rank)+value2(team_rank);
+      }
+    }
+  });
+}
+
+template<int N, typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_analytic_flat(const FluxView& flux, const WgbView& wgb,
+                       const SrcView& src, const WbsView& wbs,
+                       const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
+                       KOKKOS_LAMBDA (const size_t cell)
+  {
+    scalar_type value[N+1],value2[N+1];
+    for (int basis=0; basis<num_basis; ++basis) {
+      for (int k=0; k<N+1; ++k) {
+        value[k] = 0.0;
+        value2[k] = 0.0;
+      }
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim) {
+          const scalar_type flux_val = flux(cell,qp,dim,N);
+          const scalar_type wgb_val = wgb(cell,basis,qp,dim);
+          value[N] += flux_val*wgb_val;
+          for(int k=0; k<N; k++)
+            value[k] += flux(cell,qp,dim,k)*wgb_val;
+        }
+        const scalar_type src_val = src(cell,qp,N);
+        const scalar_type wbs_val = wbs(cell,basis,qp);
+        value2[N] += src_val*wbs_val;
+        for(int k=0; k<N; k++)
+          value2[k] += src(cell,qp,k)*wbs_val;
+      }
+      for(int k=0; k<=N; k++)
+        residual(cell,basis,k) = value[k]+value2[k];
+    }
+  });
+}
+
+template<int N, typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_analytic_team(const FluxView& flux, const WgbView& wgb,
+                       const SrcView& src, const WbsView& wbs,
+                       const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+  typedef Kokkos::View<scalar_type[N+1], typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const size_t bytes     = 2*tmp_scratch_type::shmem_size();
+  policy_type policy(num_cells,num_basis,32);
+  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerThread(bytes)),
+                       KOKKOS_LAMBDA (const team_member& team)
+  {
+    tmp_scratch_type value(team.thread_scratch(0));
+    tmp_scratch_type value2(team.thread_scratch(0));
+    const size_t cell = team.league_rank();
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,num_basis),
+                         [&] (const int& basis)
+    {
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1),
+                           [&] (const int& k)
+      {
+        value(k) = 0;
+        value2(k) = 0;
+      });
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim) {
+          const scalar_type flux_val = flux(cell,qp,dim,N);
+          const scalar_type wgb_val = wgb(cell,basis,qp,dim);
+          Kokkos::single(Kokkos::PerThread(team), [&] () {
+            value[N] += flux_val*wgb_val;
+          });
+          Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N),
+                               [&] (const int& k)
+          {
+            value[k] += flux(cell,qp,dim,k)*wgb_val;
+          });
+        }
+        const scalar_type src_val = src(cell,qp,N);
+        const scalar_type wbs_val = wbs(cell,basis,qp);
+        Kokkos::single(Kokkos::PerThread(team), [&] () {
+          value2[N] += src_val*wbs_val;
+        });
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N),
+                             [&] (const int& k)
+        {
+          value2[k] += src(cell,qp,k)*wbs_val;
+        });
+      }
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,N+1),
+                           [&] (const int& k)
+      {
+        residual(cell,basis,k) = value[k]+value2[k];
+      });
+    });
+  });
+}
+
+template <typename FadType, int N, typename ExecSpace>
+double time_fad_flat(int ncells, int num_basis, int num_points, int ndim,
+                     int ntrial, bool check)
+{
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<FadType***,ExecSpace> t_3DView;
+  typedef Kokkos::View<FadType**,ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_fad_flat(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_fad_flat(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <typename FadType, int N, typename ExecSpace>
+double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim,
+                        int ntrial, bool check)
+{
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<FadType***,ExecSpace> t_3DView;
+  typedef Kokkos::View<FadType**,ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_fad_scratch(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_fad_scratch(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim,
+                          int ntrial, bool check)
+{
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
+  typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim);
+  t_2DView src("",ncells,num_points);
+  t_2DView residual("",ncells,num_basis);
+  init_array(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_analytic_flat<N>(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_analytic_flat<N>(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_analytic_const(int ncells, int num_basis, int num_points, int ndim,
+                           int ntrial, bool check)
+{
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
+  typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim);
+  t_2DView src("",ncells,num_points);
+  t_2DView residual("",ncells,num_basis);
+  init_array(wgb, wbs, flux, src, residual);
+
+  typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_3DView_const;
+  t_3DView_const flux_const = flux;
+
+  // Run once to warm up, complete any UVM transfers
+  run_analytic_flat<N>(flux_const, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_analytic_flat<N>(flux_const, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_analytic_team(int ncells, int num_basis, int num_points, int ndim,
+                          int ntrial, bool check)
+{
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<double***[N+1],ExecSpace> t_3DView;
+  typedef Kokkos::View<double**[N+1],ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim);
+  t_2DView src("",ncells,num_points);
+  t_2DView residual("",ncells,num_basis);
+  init_array(wgb, wbs, flux, src, residual);
+
+  typedef Kokkos::View<const double***[N+1],ExecSpace,Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_3DView_const;
+  t_3DView_const flux_const = flux;
+
+  // Run once to warm up, complete any UVM transfers
+  run_analytic_team<N>(flux_const, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_analytic_team<N>(flux_const, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+#define INST_FUNC_FAD_N_DEV(FAD,N,DEV) \
+  template double time_fad_flat< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_fad_scratch< FAD, N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
+
+#define INST_FUNC_N_DEV(N,DEV) \
+  INST_FUNC_FAD_N_DEV(SFadType,N,DEV) \
+  INST_FUNC_FAD_N_DEV(DFadType,N,DEV) \
+  template double time_analytic_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_analytic_const< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_analytic_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
+
+#define INST_FUNC_DEV(DEV) \
+  INST_FUNC_N_DEV( fad_dim, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/advection_const_basis/advection.hpp b/packages/sacado/test/performance/advection_const_basis/advection.hpp
new file mode 100644
index 000000000000..adca588cee11
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/advection.hpp
@@ -0,0 +1,50 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+template <typename FadType, int N, typename ExecSpace>
+double time_fad_flat(int ncells, int num_basis, int num_points, int ndim,
+                     int ntrial, bool check);
+
+template <typename FadType, int N, typename ExecSpace>
+double time_fad_scratch(int ncells, int num_basis, int num_points, int ndim,
+                        int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_analytic_flat(int ncells, int num_basis, int num_points, int ndim,
+                          int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_analytic_const(int ncells, int num_basis, int num_points, int ndim,
+                           int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_analytic_team(int ncells, int num_basis, int num_points, int ndim,
+                          int ntrial, bool check);
diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp
new file mode 100644
index 000000000000..40aa3082d35c
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp
@@ -0,0 +1,232 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#define SACADO_VIEW_CUDA_HIERARCHICAL 1
+#define SACADO_ALIGN_SFAD 1
+
+#include "Sacado.hpp"
+#include "advection_hierarchical.hpp"
+#include "common.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_fad_hierarchical_flat(const FluxView& flux, const WgbView& wgb,
+                               const SrcView& src, const WbsView& wbs,
+                               const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const bool is_cuda     = is_cuda_space<execution_space>::value;
+  const int vector_size  = is_cuda ? 32 : 1;
+  const int team_size    = is_cuda ? 256/vector_size : 1;
+  const size_t range     = (num_cells+team_size-1)/team_size;
+
+  policy_type policy(range,team_size,vector_size);
+  Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
+  {
+    const size_t cell = team.league_rank()*team_size + team.team_rank();
+    local_scalar_type value, value2;
+    for (int basis=0; basis<num_basis; ++basis) {
+      value = 0.0;
+      value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value+value2;
+    }
+  });
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_fad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
+                               const SrcView& src, const WbsView& wbs,
+                               const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const bool is_cuda     = is_cuda_space<execution_space>::value;
+  const int vector_size  = is_cuda ? 32 : 1;
+  const int team_size    = is_cuda ? 256/vector_size : 1;
+
+  policy_type policy(num_cells,team_size,vector_size);
+  Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
+  {
+    const int team_rank = team.team_rank();
+    const size_t cell = team.league_rank();
+    local_scalar_type value, value2;
+    for (int basis=team_rank; basis<num_basis; basis+=team_size) {
+      value = 0.0;
+      value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value+value2;
+    }
+  });
+}
+
+template <int N, typename ExecSpace>
+double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points,
+                                  int ndim, int ntrial, bool check)
+{
+  typedef Sacado::Fad::SFad<double,N> FadType;
+
+  static const int FadStride = is_cuda_space<ExecSpace>::value ? 32 : 1;
+#if defined(SACADO_ALIGN_SFAD)
+  static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
+  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
+#else
+  typedef FadType AlignedFadType;
+#endif
+
+  typedef typename ExecSpace::array_layout DefaultLayout;
+  typedef Kokkos::LayoutContiguous<DefaultLayout,FadStride> ContLayout;
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<AlignedFadType***,ContLayout,ExecSpace> t_3DView;
+  typedef Kokkos::View<AlignedFadType**,ContLayout,ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_fad_hierarchical_flat(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_fad_hierarchical_flat(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_fad_hierarchical_team(int ncells, int num_basis, int num_points,
+                                  int ndim, int ntrial, bool check)
+{
+  typedef Sacado::Fad::SFad<double,N> FadType;
+
+  static const int FadStride = is_cuda_space<ExecSpace>::value ? 32 : 1;
+#if defined(SACADO_ALIGN_SFAD)
+  static const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
+  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
+#else
+  typedef FadType AlignedFadType;
+#endif
+
+  typedef typename ExecSpace::array_layout DefaultLayout;
+  typedef Kokkos::LayoutContiguous<DefaultLayout,FadStride> ContLayout;
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<AlignedFadType***,ContLayout,ExecSpace> t_3DView;
+  typedef Kokkos::View<AlignedFadType**,ContLayout,ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_fad_hierarchical_team(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_fad_hierarchical_team(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+#define INST_FUNC_N_DEV(N,DEV) \
+  template double time_fad_hierarchical_flat< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_fad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
+
+#define INST_FUNC_DEV(DEV) \
+  INST_FUNC_N_DEV( fad_dim, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp
new file mode 100644
index 000000000000..f78f4316897b
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.hpp
@@ -0,0 +1,38 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+template <int N, typename ExecSpace>
+double time_fad_hierarchical_flat(int ncells, int num_basis, int num_points,
+                                  int ndim, int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_fad_hierarchical_team(int ncells, int num_basis, int num_points,
+                                  int ndim, int ntrial, bool check);
diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.cpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.cpp
new file mode 100644
index 000000000000..980e9b6eec82
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.cpp
@@ -0,0 +1,239 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
+#define SACADO_KOKKOS_USE_MEMORY_POOL 1
+
+#include "Sacado.hpp"
+#include "advection_hierarchical_dfad.hpp"
+#include "common.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_dfad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
+                                const SrcView& src, const WbsView& wbs,
+                                const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const bool is_cuda     = is_cuda_space<execution_space>::value;
+  const int vector_size  = is_cuda ? 32 : 1;
+  const int team_size    = is_cuda ? 256/vector_size : 1;
+
+  policy_type policy(num_cells,team_size,vector_size);
+  Kokkos::parallel_for(policy, KOKKOS_LAMBDA (const team_member& team)
+  {
+    const int team_rank = team.team_rank();
+    const size_t cell = team.league_rank();
+    scalar_type value, value2;
+    for (int basis=team_rank; basis<num_basis; basis+=team_size) {
+      value = 0.0;
+      value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value+value2;
+    }
+  });
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void run_dfad_hierarchical_team_scratch(
+  const FluxView& flux, const WgbView& wgb,
+  const SrcView& src, const WbsView& wbs,
+  const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+  typedef typename ResidualView::non_const_value_type scalar_type;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type team_member;
+  typedef Kokkos::View<scalar_type*, typename execution_space::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > tmp_scratch_type;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+
+  const bool is_cuda     = is_cuda_space<execution_space>::value;
+  const int vector_size  = is_cuda ? 32 : 1;
+  const int team_size    = is_cuda ? 256/vector_size : 1;
+  const int fad_size     = Kokkos::dimension_scalar(residual);
+  const size_t bytes     = 2*tmp_scratch_type::shmem_size(team_size,fad_size);
+  policy_type policy(num_cells,team_size,vector_size);
+
+  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(bytes)),
+                       KOKKOS_LAMBDA (const team_member& team)
+  {
+    const int team_rank = team.team_rank();
+    const size_t cell = team.league_rank();
+    tmp_scratch_type value(team.team_scratch(0), team_size, fad_size);
+    tmp_scratch_type value2(team.team_scratch(0), team_size, fad_size);
+    for (int basis=team_rank; basis<num_basis; basis+=team_size) {
+      value(team_rank) = 0.0;
+      value2(team_rank) = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value(team_rank) += flux(cell,qp,dim)*wgb(cell,basis,qp,dim);
+        value2(team_rank) += src(cell,qp)*wbs(cell,basis,qp);
+      }
+      residual(cell,basis) = value(team_rank)+value2(team_rank);
+    }
+  });
+}
+
+template <int N, typename ExecSpace>
+double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points,
+                                   int ndim, int ntrial, bool check)
+{
+  typedef Sacado::Fad::DFad<double> FadType;
+
+  typedef typename ExecSpace::array_layout DefaultLayout;
+  typedef Kokkos::LayoutContiguous<DefaultLayout> ContLayout;
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
+  typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Create memory pool for DFad
+  // The kernel allocates 2*N double's per warp on Cuda.  Approximate
+  // the maximum number of warps as the maximum concurrency / 32.
+  // Include a fudge factor of 1.2 since memory pool treats a block as full
+  // once it reaches 80% capacity
+  const size_t block_size = N*sizeof(double);
+  size_t nkernels = ExecSpace::concurrency()*2;
+  if (is_cuda_space<ExecSpace>::value)
+    nkernels /= 32;
+  const size_t mem_pool_size = static_cast<size_t>(1.2*nkernels*block_size);
+  const size_t superblock_size =
+    std::max<size_t>(nkernels / 100, 1) * block_size;
+  ExecSpace exec_space;
+  Sacado::createGlobalMemoryPool(exec_space, mem_pool_size,
+                                 block_size, block_size, superblock_size);
+
+  // Run once to warm up, complete any UVM transfers
+  run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_dfad_hierarchical_team(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  // Destroy memory pool
+  Sacado::destroyGlobalMemoryPool(exec_space);
+
+  return time;
+}
+
+template <int N, typename ExecSpace>
+double time_dfad_hierarchical_team_scratch(
+  int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check)
+{
+  typedef Sacado::Fad::DFad<double> FadType;
+
+  typedef typename ExecSpace::array_layout DefaultLayout;
+  typedef Kokkos::LayoutContiguous<DefaultLayout> ContLayout;
+  typedef Kokkos::View<double****,ExecSpace> t_4DView_d;
+  typedef Kokkos::View<double***,ExecSpace> t_3DView_d;
+  typedef Kokkos::View<FadType***,ContLayout,ExecSpace> t_3DView;
+  typedef Kokkos::View<FadType**,ContLayout,ExecSpace> t_2DView;
+
+  t_4DView_d wgb("",ncells,num_basis,num_points,ndim);
+  t_3DView_d wbs("",ncells,num_basis,num_points);
+  t_3DView flux("",ncells,num_points,ndim,N+1);
+  t_2DView src("",ncells,num_points,N+1);
+  t_2DView residual("",ncells,num_basis,N+1);
+  init_fad(wgb, wbs, flux, src, residual);
+
+  // Run once to warm up, complete any UVM transfers
+  run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
+
+  // Time execution
+  Kokkos::fence();
+  Kokkos::Impl::Timer timer;
+  for (int i=0; i<ntrial; ++i)
+    run_dfad_hierarchical_team_scratch(flux, wgb, src, wbs, residual);
+  Kokkos::fence();
+  double time = timer.seconds() / ntrial / ncells;
+
+  // Check result
+  if (check)
+    check_residual(flux, wgb, src, wbs, residual);
+
+  return time;
+}
+
+#define INST_FUNC_N_DEV(N,DEV) \
+  template double time_dfad_hierarchical_team< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check); \
+  template double time_dfad_hierarchical_team_scratch< N, DEV >(int ncells, int num_basis, int num_points, int ndim, int ntrial, bool check);
+
+#define INST_FUNC_DEV(DEV) \
+  INST_FUNC_N_DEV( fad_dim, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.hpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.hpp
new file mode 100644
index 000000000000..9933d1102f73
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical_dfad.hpp
@@ -0,0 +1,39 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+template <int N, typename ExecSpace>
+double time_dfad_hierarchical_team(int ncells, int num_basis, int num_points,
+                                   int ndim, int ntrial, bool check);
+
+template <int N, typename ExecSpace>
+double time_dfad_hierarchical_team_scratch(int ncells, int num_basis,
+                                           int num_points, int ndim, int ntrial,
+                                           bool check);
diff --git a/packages/sacado/test/performance/advection_const_basis/common.hpp b/packages/sacado/test/performance/advection_const_basis/common.hpp
new file mode 100644
index 000000000000..4f1819ae4c06
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/common.hpp
@@ -0,0 +1,350 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+const int fad_dim = 50;
+typedef Sacado::Fad::SFad<double,fad_dim> SFadType;
+typedef Sacado::Fad::DFad<double> DFadType;
+
+template <typename ExecSpace>
+struct is_cuda_space {
+  static const bool value = false;
+};
+
+#ifdef KOKKOS_ENABLE_CUDA
+template <>
+struct is_cuda_space<Kokkos::Cuda> {
+  static const bool value = true;
+};
+#endif
+
+template <typename scalar>
+scalar
+generate_fad(const size_t n0, const size_t n1,
+             const size_t n2, const size_t n3, const int fad_size,
+             const size_t i0, const size_t i1,
+             const size_t i2, const size_t i3,
+             const int i_fad)
+{
+  const scalar x0 =    10.0 + scalar(n0) / scalar(i0+1);
+  const scalar x1 =   100.0 + scalar(n1) / scalar(i1+1);
+  const scalar x2 =  1000.0 + scalar(n2) / scalar(i2+1);
+  const scalar x3 = 10000.0 + scalar(n3) / scalar(i3+1);
+  const scalar x  = x0 + x1 + x2 + x3;
+  if (i_fad == fad_size)
+    return x;
+  const scalar x_fad = 1.0 + scalar(fad_size) / scalar(i_fad+1);
+  return x + x_fad;
+}
+
+template <typename WgbView, typename WbsView, typename FluxView,
+          typename SrcView, typename ResidualView>
+void init_fad(const WgbView& wgb, const WbsView& wbs, const FluxView& flux,
+              const SrcView& src, const ResidualView& residual)
+{
+  typedef typename ResidualView::non_const_value_type::value_type scalar;
+
+  const int ncells     = wgb.extent(0);
+  const int num_basis  = wgb.extent(1);
+  const int num_points = wgb.extent(2);
+  const int ndim       = wgb.extent(3);
+  const int N          = Kokkos::dimension_scalar(residual)-1;
+
+  auto wgb_h = Kokkos::create_mirror_view(wgb);
+  auto wbs_h = Kokkos::create_mirror_view(wbs);
+  auto flux_h = Kokkos::create_mirror_view(flux);
+  auto src_h = Kokkos::create_mirror_view(src);
+  for (int cell=0; cell<ncells; ++cell) {
+    for (int basis=0; basis<num_basis; ++basis) {
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<ndim; ++dim) {
+          wgb_h(cell,basis,qp,dim) =
+            generate_fad<scalar>(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
+        }
+        wbs_h(cell,basis,qp) =
+          generate_fad<scalar>(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
+      }
+    }
+    for (int qp=0; qp<num_points; ++qp) {
+      for (int dim=0; dim<ndim; ++dim) {
+        for (int i=0; i<N; ++i)
+          flux_h(cell,qp,dim).fastAccessDx(i) =
+            generate_fad<scalar>(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
+        flux_h(cell,qp,dim).val() =
+          generate_fad<scalar>(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
+      }
+        for (int i=0; i<N; ++i)
+          src_h(cell,qp).fastAccessDx(i) =
+            generate_fad<scalar>(ncells,1,num_points,1,N,cell,0,qp,0,i);
+        src_h(cell,qp).val() =
+          generate_fad<scalar>(ncells,1,num_points,1,N,cell,0,qp,0,N);
+    }
+  }
+
+  Kokkos::deep_copy( wgb, wgb_h );
+  Kokkos::deep_copy( wbs, wbs_h );
+  Kokkos::deep_copy( flux, flux_h );
+  Kokkos::deep_copy( src, src_h );
+
+  Kokkos::deep_copy(typename ResidualView::array_type(residual), 0.0);
+}
+
+template <typename WgbView, typename WbsView, typename FluxView,
+          typename SrcView, typename ResidualView>
+void init_array(const WgbView& wgb, const WbsView& wbs, const FluxView& flux,
+                const SrcView& src, const ResidualView& residual)
+{
+  typedef typename ResidualView::non_const_value_type scalar;
+
+  const int ncells     = wgb.extent(0);
+  const int num_basis  = wgb.extent(1);
+  const int num_points = wgb.extent(2);
+  const int ndim       = wgb.extent(3);
+  const int N          = residual.extent(2)-1;
+
+  auto wgb_h = Kokkos::create_mirror_view(wgb);
+  auto wbs_h = Kokkos::create_mirror_view(wbs);
+  auto flux_h = Kokkos::create_mirror_view(flux);
+  auto src_h = Kokkos::create_mirror_view(src);
+  for (int cell=0; cell<ncells; ++cell) {
+    for (int basis=0; basis<num_basis; ++basis) {
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<ndim; ++dim) {
+          wgb_h(cell,basis,qp,dim) =
+            generate_fad<scalar>(ncells,num_basis,num_points,ndim,N,cell,basis,qp,dim,N);
+        }
+        wbs_h(cell,basis,qp) =
+          generate_fad<scalar>(ncells,num_basis,num_points,1,N,cell,basis,qp,0,N);
+      }
+    }
+    for (int qp=0; qp<num_points; ++qp) {
+      for (int dim=0; dim<ndim; ++dim) {
+        for (int i=0; i<N; ++i)
+          flux_h(cell,qp,dim,i) =
+            generate_fad<scalar>(ncells,1,num_points,ndim,N,cell,0,qp,dim,i);
+        flux_h(cell,qp,dim,N) =
+          generate_fad<scalar>(ncells,1,num_points,ndim,N,cell,0,qp,dim,N);
+      }
+      for (int i=0; i<N; ++i)
+        src_h(cell,qp,i) =
+          generate_fad<scalar>(ncells,1,num_points,1,N,cell,0,qp,0,i);
+      src_h(cell,qp,N) =
+        generate_fad<scalar>(ncells,1,num_points,1,N,cell,0,qp,0,N);
+    }
+  }
+
+  Kokkos::deep_copy( wgb, wgb_h );
+  Kokkos::deep_copy( wbs, wbs_h );
+  Kokkos::deep_copy( flux, flux_h );
+  Kokkos::deep_copy( src, src_h );
+
+  Kokkos::deep_copy(residual, 0.0);
+}
+
+template <typename View1, typename View2>
+typename std::enable_if< !Kokkos::is_view_fad<View2>::value, bool>::type
+check(const View1& v_gold, const View2& v, const double tol)
+{
+  // Copy to host
+  typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
+  typename View2::HostMirror v_h      = Kokkos::create_mirror_view(v);
+  Kokkos::deep_copy(v_gold_h, v_gold);
+  Kokkos::deep_copy(v_h, v);
+
+  typedef typename View1::value_type value_type;
+
+  const size_t n0 = v_gold_h.extent(0);
+  const size_t n1 = v_gold_h.extent(1);
+  const size_t n2 = v_gold_h.extent(2);
+
+  bool success = true;
+  for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+        value_type x_gold = v_gold_h(i0,i1,i2);
+        value_type x      = v_h(i0,i1,i2);
+        if (std::abs(x_gold-x) > tol*std::abs(x_gold)) {
+          std::cout << "Comparison failed!  x_gold("
+                    << i0 << "," << i1 << "," << i2 << ") = "
+                    << x_gold << " , x = " << x
+                    << std::endl;
+          success = false;
+        }
+      }
+    }
+  }
+
+  return success;
+}
+
+template <typename View1, typename View2>
+typename std::enable_if< Kokkos::is_view_fad<View2>::value, bool>::type
+check(const View1& v_gold, const View2& v, const double tol)
+{
+  // Copy to host
+  typename View1::HostMirror v_gold_h = Kokkos::create_mirror_view(v_gold);
+  typename View2::HostMirror v_h      = Kokkos::create_mirror_view(v);
+  Kokkos::deep_copy(v_gold_h, v_gold);
+  Kokkos::deep_copy(v_h, v);
+
+  typedef typename View1::value_type value_type;
+
+  const size_t n0 = v_gold_h.extent(0);
+  const size_t n1 = v_gold_h.extent(1);
+  const size_t n2 = v_gold_h.extent(2);
+
+  bool success = true;
+  for ( size_t i0 = 0 ; i0 < n0 ; ++i0 ) {
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+        value_type x_gold = v_gold_h(i0,i1,i2);
+        value_type x      = (i2 == n2-1) ? v_h(i0,i1).val() : v_h(i0,i1).dx(i2);
+        if (std::abs(x_gold-x) > tol*std::abs(x_gold)) {
+          std::cout << "Comparison failed!  x_gold("
+                    << i0 << "," << i1 << "," << i2 << ") = "
+                    << x_gold << " , x = " << x
+                    << std::endl;
+          success = false;
+        }
+      }
+    }
+  }
+
+  return success;
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView>
+Kokkos::View<double***,typename FluxView::execution_space>
+compute_gold_residual(
+  const FluxView& flux, const WgbView& wgb, const SrcView& src,
+  const WbsView& wbs,
+  typename std::enable_if< Kokkos::is_view_fad<FluxView>::value>::type* = 0)
+{
+  typedef typename FluxView::execution_space execution_space;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+  const int N            = Kokkos::dimension_scalar(flux)-1;
+
+  Kokkos::View<double***,typename FluxView::execution_space> residual(
+    "",num_cells,num_basis,N+1);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
+                       KOKKOS_LAMBDA (const size_t cell)
+  {
+    double value, value2;
+
+    // Value
+    for (int basis=0; basis<num_basis; ++basis) {
+      value = value2 = 0.0;
+      for (int qp=0; qp<num_points; ++qp) {
+        for (int dim=0; dim<num_dim; ++dim)
+          value += flux(cell,qp,dim).val()*wgb(cell,basis,qp,dim);
+        value2 += src(cell,qp).val()*wbs(cell,basis,qp);
+      }
+      residual(cell,basis,N) = value+value2;
+    }
+
+    // Derivatives
+    for (int k=0; k<N; ++k) {
+      for (int basis=0; basis<num_basis; ++basis) {
+        value = value2 = 0.0;
+        for (int qp=0; qp<num_points; ++qp) {
+          for (int dim=0; dim<num_dim; ++dim)
+            value +=
+              flux(cell,qp,dim).dx(k)*wgb(cell,basis,qp,dim);
+          value2 +=
+            src(cell,qp).dx(k)*wbs(cell,basis,qp);
+        }
+        residual(cell,basis,k) = value+value2;
+      }
+    }
+  });
+
+  return residual;
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView>
+Kokkos::View<double***,typename FluxView::execution_space>
+compute_gold_residual(
+  const FluxView& flux, const WgbView& wgb, const SrcView& src,
+  const WbsView& wbs,
+  typename std::enable_if< !Kokkos::is_view_fad<FluxView>::value>::type* = 0)
+{
+  typedef typename FluxView::execution_space execution_space;
+
+  const size_t num_cells = wgb.extent(0);
+  const int num_basis    = wgb.extent(1);
+  const int num_points   = wgb.extent(2);
+  const int num_dim      = wgb.extent(3);
+  const int N            = flux.extent(3)-1;
+
+  Kokkos::View<double***,typename FluxView::execution_space> residual(
+    "",num_cells,num_basis,N+1);
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>( 0,num_cells ),
+                       KOKKOS_LAMBDA (const size_t cell)
+  {
+    double value, value2;
+    for (int k=0; k<=N; ++k) {
+      for (int basis=0; basis<num_basis; ++basis) {
+        value = value2 = 0.0;
+        for (int qp=0; qp<num_points; ++qp) {
+          for (int dim=0; dim<num_dim; ++dim)
+            value += flux(cell,qp,dim,k)*wgb(cell,basis,qp,dim);
+          value2 += src(cell,qp,k)*wbs(cell,basis,qp);
+        }
+        residual(cell,basis,k) = value+value2;
+      }
+    }
+  });
+
+  return residual;
+}
+
+template<typename FluxView, typename WgbView, typename SrcView,
+         typename WbsView, typename ResidualView>
+void check_residual(const FluxView& flux, const WgbView& wgb,
+                    const SrcView& src, const WbsView& wbs,
+                    const ResidualView& residual)
+{
+  typedef typename ResidualView::execution_space execution_space;
+
+  // Generate gold residual
+  auto residual_gold = compute_gold_residual(flux, wgb, src, wbs);
+
+  // Compare residual and residual_gold
+  const double tol = 1.0e-14;
+  check(residual_gold, residual, tol);
+}
diff --git a/packages/sacado/test/performance/advection_const_basis/driver.cpp b/packages/sacado/test/performance/advection_const_basis/driver.cpp
new file mode 100644
index 000000000000..1812cc80ed0a
--- /dev/null
+++ b/packages/sacado/test/performance/advection_const_basis/driver.cpp
@@ -0,0 +1,164 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+// A performance test that computes the derivative of a simple Kokkos kernel
+// using various Fad classes
+
+#include "Sacado.hpp"
+
+#include "advection.hpp"
+#include "advection_hierarchical.hpp"
+#include "advection_hierarchical_dfad.hpp"
+#include "common.hpp"
+
+#include "Teuchos_CommandLineProcessor.hpp"
+#include "Teuchos_StandardCatchMacros.hpp"
+
+template <typename ExecSpace>
+void run(const int cell_begin, const int cell_end, const int cell_step,
+         const int nbasis, const int npoint, const int ntrial, const bool check)
+{
+  const int ndim = 3;
+  printf("ncell %12s %12s %12s %12s %12s %12s %12s %12s %12s\n", "flat sfad", "flat dfad", "dfad sc", "analytic", "const", "team", "hier sfad", "hier dfad", "h dfad sc");
+  for(int i=cell_begin; i<=cell_end; i+=cell_step) {
+    double sfad_flat = time_fad_flat<SFadType,fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double dfad_flat = time_fad_flat<DFadType,fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double dfad_scratch = time_fad_scratch<DFadType,fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double analytic = time_analytic_flat<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double analytic_const = time_analytic_const<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double analytic_team = time_analytic_team<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double sfad_hierarchical = time_fad_hierarchical_team<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double dfad_hierarchical = time_dfad_hierarchical_team<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    double dfad_hierarchical_scratch =
+      time_dfad_hierarchical_team_scratch<fad_dim,ExecSpace>(
+      i,nbasis,npoint,ndim,ntrial,check);
+    printf("%5d %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e %12.3e\n",i,sfad_flat,dfad_flat,dfad_scratch,analytic,analytic_const,analytic_team,sfad_hierarchical,dfad_hierarchical,dfad_hierarchical_scratch);
+  }
+}
+
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc,argv);
+
+  bool success = true;
+  try {
+
+    // Set up command line options
+    Teuchos::CommandLineProcessor clp(false);
+    clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
+#ifdef KOKKOS_ENABLE_SERIAL
+    bool serial = 0;
+    clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+    bool openmp = 0;
+    clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP");
+#endif
+#ifdef KOKKOS_ENABLE_THREADS
+    bool threads = 0;
+    clp.setOption("threads", "no-threads", &threads, "Whether to run Threads");
+#endif
+#ifdef KOKKOS_ENABLE_CUDA
+    bool cuda = 0;
+    clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
+#endif
+    bool print_config = false;
+    clp.setOption("print-config", "no-print-config", &print_config,
+                  "Whether to print Kokkos device configuration");
+    int cell_begin = 100;
+    clp.setOption("begin", &cell_begin, "Starting number of cells");
+    int cell_end = 8000;
+    clp.setOption("end", &cell_end, "Ending number of cells");
+    int cell_step = 100;
+    clp.setOption("step", &cell_step, "Cell increment");
+    int nbasis = 8;
+    clp.setOption("basis", &nbasis, "Number of basis functions");
+    int npoint = 8;
+    clp.setOption("point", &npoint, "Number of integration points");
+    int ntrial = 5;
+    clp.setOption("trial", &ntrial, "Number of trials");
+    bool check = false;
+    clp.setOption("check", "no-check", &check,
+                  "Check correctness of results");
+
+    // Parse options
+    switch (clp.parse(argc, argv)) {
+    case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:
+      return 0;
+    case Teuchos::CommandLineProcessor::PARSE_ERROR:
+    case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION:
+      return 1;
+    case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:
+      break;
+    }
+
+    if (print_config)
+      Kokkos::print_configuration(std::cout, true);
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if (serial) {
+      using Kokkos::Serial;
+      run<Serial>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+    if (openmp) {
+      using Kokkos::OpenMP;
+      run<OpenMP>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+    if (threads) {
+      using Kokkos::Threads;
+      run<Threads>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+    if (cuda) {
+      using Kokkos::Cuda;
+      run<Cuda>(cell_begin, cell_end, cell_step, nbasis, npoint, ntrial, check);
+    }
+#endif
+  }
+  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
+
+  Kokkos::finalize();
+
+  return !success;
+}
diff --git a/packages/sacado/test/performance/mat_vec/CMakeLists.txt b/packages/sacado/test/performance/mat_vec/CMakeLists.txt
new file mode 100644
index 000000000000..34a887346a65
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/CMakeLists.txt
@@ -0,0 +1,22 @@
+ASSERT_DEFINED(PACKAGE_SOURCE_DIR CMAKE_CURRENT_SOURCE_DIR)
+
+INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+
+IF (Sacado_ENABLE_KokkosCore AND Sacado_ENABLE_TeuchosCore)
+
+  IF(NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8")))
+
+    TRIBITS_ADD_EXECUTABLE(
+      FadMatVecTest
+      SOURCES common.hpp
+              mat_vec.hpp mat_vec.cpp
+              mat_vec_hierarchical.hpp mat_vec_hierarchical.cpp
+              mat_vec_hierarchical_dfad.hpp mat_vec_hierarchical_dfad.cpp
+              driver.cpp
+      COMM serial mpi
+      )
+
+  ENDIF()
+
+ENDIF()
diff --git a/packages/sacado/test/performance/mat_vec/common.hpp b/packages/sacado/test/performance/mat_vec/common.hpp
new file mode 100644
index 000000000000..246efd24e5d1
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/common.hpp
@@ -0,0 +1,45 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+//#define SACADO_DISABLE_FAD_VIEW_SPEC
+
+#include "impl/Kokkos_Timer.hpp"
+
+struct Perf {
+  double time;
+  double flops;
+  double throughput;
+};
+
+const int SFadSize  = 32;
+const int SLFadSize = SFadSize;
+const int HierSFadSize  = 32;
+const int HierSLFadSize = HierSFadSize;
diff --git a/packages/sacado/test/performance/mat_vec/driver.cpp b/packages/sacado/test/performance/mat_vec/driver.cpp
new file mode 100644
index 000000000000..d7a39362e7f7
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/driver.cpp
@@ -0,0 +1,386 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+// A performance test that computes the derivative of a simple Kokkos kernel
+// using various Fad classes
+
+#include "mat_vec.hpp"
+#include "mat_vec_hierarchical.hpp"
+#include "mat_vec_hierarchical_dfad.hpp"
+
+#include "Sacado.hpp"
+
+#include "Teuchos_CommandLineProcessor.hpp"
+#include "Teuchos_StandardCatchMacros.hpp"
+
+// For vtune
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+
+void
+print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
+           const std::string& name)
+{
+  std::cout << name << "\t "
+            << perf.time << "\t "
+            << perf.throughput << "\t "
+            << perf.time / perf_base.time
+            << std::endl;
+}
+
+template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
+          typename ... ViewArgs>
+void
+do_times(const size_t m,
+         const size_t n,
+         const size_t p,
+         const size_t nloop,
+         const bool value,
+         const bool analytic,
+         const bool sfad,
+         const bool slfad,
+         const bool dfad,
+         const bool flat,
+         const bool hierarchical,
+         const bool check)
+{
+  Perf perf_value;
+  perf_value.time = 1.0;
+
+  // Run value
+  if (value) {
+    try {
+      Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
+      perf_value = perf;
+      print_perf(perf, perf_value, p, "Value     ");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+
+  // Run analytic
+  if (analytic) {
+    try {
+      Perf perf =
+        do_time_analytic<ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf, perf_value, p, "Analytic  ");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+  if(analytic && p == SFadSize) {
+    try {
+      Perf perf =
+        do_time_analytic_s<SFadSize, ViewArgs...>(m,n,nloop,check);
+      print_perf(perf, perf_value, p, "Analytic-s");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+  if(analytic && p <= SLFadSize) {
+    try {
+      Perf perf =
+        do_time_analytic_sl<SLFadSize, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf, perf_value, p, "Analytic-sl");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+
+  // Run flat SFad
+  if (flat && sfad && p == SFadSize) {
+    try {
+      Perf perf =
+        do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf, perf_value, p, "SFad      ");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+
+  // Run flat SLFad
+  if (flat && slfad && p <= SLFadSize) {
+    try {
+      Perf perf =
+        do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf, perf_value, p, "SLFad     ");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+
+  // Run flat DFad
+  if (flat && dfad) {
+    try {
+      Perf perf =
+        do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf, perf_value, p, "DFad      ");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+    try {
+      Perf perf_scratch =
+        do_time_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf_scratch, perf_value, p, "DFad Scratch");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+
+  // Run hierarchical SFad
+  if (hierarchical && sfad && p == HierSFadSize) {
+    try {
+      Perf perf =
+        do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf, perf_value, p, "H. SFad   ");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+
+  // Run hierarchical SLFad
+  if (hierarchical && slfad && p <= HierSLFadSize) {
+    try {
+      Perf perf =
+        do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf, perf_value, p, "H. SLFad  ");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+
+  // Run hierarchical DFad
+  if (hierarchical && dfad) {
+    try {
+      Perf perf =
+        do_time_fad_hierarchical_dfad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf, perf_value, p, "H. DFad   ");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+    try {
+      Perf perf_scratch =
+        do_time_fad_hierarchical_dfad_scratch<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
+      print_perf(perf_scratch, perf_value, p, "H. DFad Scratch");
+    }
+    catch(std::exception& e) {
+      std::cout << e.what() << std::endl;
+    }
+  }
+
+}
+
+enum LayoutType {
+  LAYOUT_LEFT=0,
+  LAYOUT_RIGHT,
+  LAYOUT_DEFAULT
+};
+const int num_layout_types = 3;
+const LayoutType layout_values[] = {
+  LAYOUT_LEFT, LAYOUT_RIGHT, LAYOUT_DEFAULT };
+const char *layout_names[] = { "left", "right", "default" };
+
+template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
+          typename Device>
+void
+do_times_layout(const size_t m,
+                const size_t n,
+                const size_t p,
+                const size_t nloop,
+                const bool value,
+                const bool analytic,
+                const bool sfad,
+                const bool slfad,
+                const bool dfad,
+                const bool flat,
+                const bool hierarchical,
+                const bool check,
+                const LayoutType& layout,
+                const std::string& device)
+{
+  int prec = 2;
+  std::cout.setf(std::ios::scientific);
+  std::cout.precision(prec);
+  std::cout << std::endl
+            << device
+            << " performance for layout "
+            << layout_names[layout]
+            << " m = " << m << " n = " << n << " p = " << p
+            << std::endl << std::endl;
+  std::cout << "Computation \t Time     \t Throughput \t Ratio" << std::endl;
+
+  if (layout == LAYOUT_LEFT)
+    do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
+      m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
+  else if (layout == LAYOUT_RIGHT)
+    do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
+      m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
+  else
+    do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
+      (m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check);
+}
+
+// Connect executable to vtune for profiling
+void connect_vtune() {
+  std::stringstream cmd;
+  pid_t my_os_pid=getpid();
+  const std::string vtune_loc =
+    "amplxe-cl";
+  const std::string output_dir = "./vtune";
+  cmd << vtune_loc
+      << " -collect hotspots -result-dir " << output_dir
+      << " -target-pid " << my_os_pid << " &";
+  std::cout << cmd.str() << std::endl;
+  system(cmd.str().c_str());
+  system("sleep 10");
+}
+
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc,argv);
+
+  bool success = true;
+  try {
+
+    // Set up command line options
+    Teuchos::CommandLineProcessor clp(false);
+    clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
+    int m = 100000;
+    clp.setOption("m", &m, "Number of matrix rows");
+    int n = 100;
+    clp.setOption("n", &n, "Number of matrix columns");
+    int p = SFadSize;
+    clp.setOption("p", &p, "Number of derivative components");
+    int nloop = 10;
+    clp.setOption("nloop", &nloop, "Number of loops");
+#ifdef KOKKOS_ENABLE_SERIAL
+    bool serial = 0;
+    clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+    bool openmp = 0;
+    clp.setOption("openmp", "no-openmp", &openmp, "Whether to run OpenMP");
+#endif
+#ifdef KOKKOS_ENABLE_THREADS
+    bool threads = 0;
+    clp.setOption("threads", "no-threads", &threads, "Whether to run Threads");
+#endif
+#ifdef KOKKOS_ENABLE_CUDA
+    bool cuda = 0;
+    clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
+#endif
+    bool print_config = false;
+    clp.setOption("print-config", "no-print-config", &print_config,
+                  "Whether to print Kokkos device configuration");
+    LayoutType layout = LAYOUT_DEFAULT;
+    clp.setOption("layout", &layout, num_layout_types, layout_values,
+                  layout_names, "View layout");
+    bool vtune = false;
+    clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
+    bool value = true;
+    clp.setOption("value", "no-value", &value, "Run value calculation");
+    bool analytic = true;
+    clp.setOption("analytic", "no-analytic", &analytic,
+                  "Run analytic derivative calculation");
+    bool sfad = true;
+    clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
+    bool slfad = true;
+    clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
+    bool dfad = true;
+    clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
+    bool flat = true;
+    clp.setOption("flat", "no-flat", &flat, "Run flat Fad derivative calculation");
+    bool hierarchical = true;
+    clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
+    bool check = false;
+    clp.setOption("check", "no-check", &check, "Check calculations are correct");
+
+    // Parse options
+    switch (clp.parse(argc, argv)) {
+      case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:
+        return 0;
+      case Teuchos::CommandLineProcessor::PARSE_ERROR:
+      case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION:
+        return 1;
+      case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:
+        break;
+    }
+
+    if (vtune)
+      connect_vtune();
+
+    if (print_config)
+      Kokkos::print_configuration(std::cout, true);
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if (serial) {
+      do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
+        m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Serial");
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+    if (openmp) {
+      do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
+        m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"OpenMP");
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+    if (threads) {
+      do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
+        m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Threads");
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+    if (cuda) {
+      do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
+        m,n,p,nloop,value,analytic,sfad,slfad,dfad,flat,hierarchical,check,layout,"Cuda");
+    }
+#endif
+
+  }
+  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
+
+  Kokkos::finalize();
+
+  return !success;
+}
diff --git a/packages/sacado/test/performance/mat_vec/fad_kokkos_mat_vec_perf.cpp b/packages/sacado/test/performance/mat_vec/fad_kokkos_mat_vec_perf.cpp
new file mode 100644
index 000000000000..e5c49429bb37
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/fad_kokkos_mat_vec_perf.cpp
@@ -0,0 +1,680 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#define SACADO_VIEW_CUDA_HIERARCHICAL 1
+//#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
+//#define SACADO_KOKKOS_USE_MEMORY_POOL 1
+#define SACADO_ALIGN_SFAD 1
+
+//#define SACADO_DISABLE_FAD_VIEW_SPEC
+#include "Sacado.hpp"
+
+#include "Teuchos_CommandLineProcessor.hpp"
+#include "Teuchos_StandardCatchMacros.hpp"
+#include "Teuchos_Time.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+// For vtune
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+
+// A performance test that computes the derivative of a simple Kokkos kernel
+// using various Fad classes
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  Kokkos::parallel_for(
+    Kokkos::RangePolicy<execution_space>( 0,m ),
+    KOKKOS_LAMBDA (const int i) {
+      scalar_type t = 0.0;
+      for (int j=0; j<n; ++j)
+        t += A(i,j)*b(j);
+      c(i) = t;
+    }
+  );
+}
+
+#if defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
+                              const ViewTypeC& c) {
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+  const unsigned vector_size = is_cuda ? 32 : 1;
+  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int range = (m+team_size-1)/team_size;
+
+  typedef Kokkos::TeamPolicy<execution_space> Policy;
+  Kokkos::parallel_for(
+    Policy( range,team_size,vector_size ),
+    KOKKOS_LAMBDA (const typename Policy::member_type& team) {
+      const int i = team.league_rank()*team.team_size() + team.team_rank();
+      if (i >= m)
+        return;
+
+      scalar_type t = 0.0;
+      for (int j=0; j<n; ++j)
+        t += A(i,j)*b(j);
+      c(i) = t;
+    }
+  );
+}
+
+#elif defined(SACADO_VIEW_CUDA_HIERARCHICAL)
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
+                              const ViewTypeC& c) {
+  typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+  const unsigned vector_size = is_cuda ? 32 : 1;
+  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int range = (m+team_size-1)/team_size;
+
+  typedef Kokkos::TeamPolicy<execution_space> Policy;
+  Kokkos::parallel_for(
+    Policy( range,team_size,vector_size ),
+    KOKKOS_LAMBDA (const typename Policy::member_type& team) {
+      const int i = team.league_rank()*team.team_size() + team.team_rank();
+      if (i >= m)
+        return;
+
+      scalar_type t = 0.0;
+      for (int j=0; j<n; ++j)
+        t += A(i,j)*b(j);
+      c(i) = t;
+    }
+  );
+}
+
+#else
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
+                              const ViewTypeC& c) {
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+  const unsigned vector_size = 1;
+  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int range = (m+team_size-1)/team_size;
+
+  typedef Kokkos::TeamPolicy<execution_space> Policy;
+  Kokkos::parallel_for(
+    Policy( range,team_size,vector_size ),
+    KOKKOS_LAMBDA (const typename Policy::member_type& team) {
+      const int i = team.league_rank()*team.team_size() + team.team_rank();
+      if (i >= m)
+        return;
+
+      scalar_type t = 0.0;
+      for (int j=0; j<n; ++j)
+        t += A(i,j)*b(j);
+      c(i) = t;
+    }
+  );
+}
+
+#endif
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  const double tol = 1.0e-14;
+  typedef typename ViewTypeC::value_type value_type;
+  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
+  Kokkos::deep_copy(h_c, c);
+  const size_t m = A.extent(0);
+  const size_t n = A.extent(1);
+  for (size_t i=0; i<m; ++i) {
+    value_type t = n;
+    if (std::abs(h_c(i)- t) > tol) {
+      std::cout << "Comparison failed!  " << i << " : " << h_c(i) << " , " << t
+                << std::endl;
+    }
+  }
+}
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  const double tol = 1.0e-14;
+  typedef typename ViewTypeC::value_type value_type;
+  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
+  Kokkos::deep_copy(h_c, c);
+  const size_t m = A.extent(0);
+  const size_t n = A.extent(1);
+  const size_t p = Kokkos::dimension_scalar(A);
+  for (size_t i=0; i<m; ++i) {
+    for (size_t j=0; j<p; ++j) {
+      value_type t = (j == p-1 ? n : 2*n);
+      if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
+        std::cout << "Comparison failed!  " << i << "," << j << " : "
+                  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
+      }
+    }
+  }
+}
+
+struct Perf {
+  double time;
+  double flops;
+  double throughput;
+};
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
+            const bool check)
+{
+  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+  ViewTypeA A("A",m,n,p+1);
+  ViewTypeB b("B",n,p+1);
+  ViewTypeC c("c",m,p+1);
+
+  FadType a(p, 1.0);
+  for (size_t k=0; k<p; ++k)
+    a.fastAccessDx(k) = 1.0;
+  Kokkos::deep_copy(A, a);
+  Kokkos::deep_copy(b, a);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check) {
+    check_deriv(A, b, c);
+  }
+
+  return perf;
+}
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
+                         const size_t nloop, const bool check)
+{
+  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+
+#if defined(SACADO_VIEW_CUDA_HIERARCHICAL)
+  const int FadStride = is_cuda ? 32 : 1;
+#if defined(SACADO_ALIGN_SFAD)
+  const int N = Sacado::StaticSize<FadType>::value;
+  const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
+  const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
+  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
+#else
+  typedef FadType AlignedFadType;
+  const size_t pa = p;
+#endif
+#else
+  const int FadStride = 1;
+  typedef FadType AlignedFadType;
+  const size_t pa = p;
+#endif
+
+#if defined(SACADO_VIEW_CUDA_HIERARCHICAL) || defined(SACADO_VIEW_CUDA_HIERARCHICAL_DFAD)
+  typedef Kokkos::LayoutContiguous<typename ViewTypeA::array_layout,FadStride> ConLayoutA;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeB::array_layout,FadStride> ConLayoutB;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeC::array_layout,FadStride> ConLayoutC;
+#else
+  typedef typename ViewTypeA::array_layout ConLayoutA;
+  typedef typename ViewTypeB::array_layout ConLayoutB;
+  typedef typename ViewTypeC::array_layout ConLayoutC;
+  (void) FadStride;
+#endif
+
+
+  typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
+  typedef Kokkos::View<AlignedFadType*,  ConLayoutB, execution_space> ConViewTypeB;
+  typedef Kokkos::View<AlignedFadType*,  ConLayoutC, execution_space> ConViewTypeC;
+
+  ConViewTypeA A("A",m,n,pa+1);
+  ConViewTypeB b("B",n,pa+1);
+  ConViewTypeC c("c",m,pa+1);
+
+  AlignedFadType a(pa, 1.0);
+  for (size_t k=0; k<pa; ++k)
+    a.fastAccessDx(k) = 1.0;
+  Kokkos::deep_copy(A, a);
+  Kokkos::deep_copy(b, a);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+#if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
+  const size_t concurrency = execution_space::concurrency();
+  const size_t warp_dim = is_cuda ? 32 : 1;
+  const size_t block_size = pa*sizeof(double);
+  const size_t nkernels = concurrency / warp_dim;
+  const size_t mem_pool_size =
+    static_cast<size_t>(1.2*nkernels*block_size);
+  const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
+  execution_space space;
+  Sacado::createGlobalMemoryPool(space, mem_pool_size,
+      block_size,
+      block_size,
+      superblock_size
+      );
+#endif
+
+  // Execute the kernel once to warm up
+  run_mat_vec_hierarchical( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec_hierarchical( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check) {
+    check_deriv(A, b, c);
+  }
+
+#if defined(SACADO_KOKKOS_USE_MEMORY_POOL)
+  Sacado::destroyGlobalMemoryPool(space);
+#endif
+
+  return perf;
+}
+
+template <typename ... ViewArgs>
+Perf
+do_time_val(const size_t m, const size_t n, const size_t nloop,
+            const bool check)
+{
+  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<double*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<double*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+  ViewTypeA A("A",m,n);
+  ViewTypeB b("B",n);
+  ViewTypeC c("c",m);
+
+  Kokkos::deep_copy(A, 1.0);
+  Kokkos::deep_copy(b, 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*2;
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check)
+    check_val(A,b,c);
+
+  return perf;
+}
+
+void
+print_perf(const Perf& perf, const Perf& perf_base, const size_t p,
+           const std::string& name)
+{
+  std::cout << name << "\t "
+            << perf.time << "\t "
+            << perf.throughput << "\t "
+            << perf.time / (perf_base.time*p)
+            << std::endl;
+}
+
+template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
+          typename ... ViewArgs>
+void
+do_times(const size_t m,
+         const size_t n,
+         const size_t p,
+         const size_t ph,
+         const size_t nloop,
+         const bool value,
+         const bool sfad,
+         const bool slfad,
+         const bool dfad,
+         const bool hierarchical,
+         const bool check)
+{
+  Perf perf_value;
+  perf_value.time = 1.0;
+
+  // Run value
+  if (value) {
+    Perf perf = do_time_val<ViewArgs...>(m,n,nloop,check);
+    perf_value = perf;
+    print_perf(perf, perf_value, p, "Value     ");
+  }
+
+  // Run SFad
+  if (sfad && p == SFadSize) {
+    Perf perf =
+      do_time_fad<Sacado::Fad::SFad<double,SFadSize>, ViewArgs...>(m,n,p,nloop,check);
+    print_perf(perf, perf_value, p, "SFad      ");
+  }
+
+  // Run SLFad
+  if (slfad && p <= SLFadSize) {
+    Perf perf =
+      do_time_fad<Sacado::Fad::SLFad<double,SLFadSize>, ViewArgs...>(m,n,p,nloop,check);
+    print_perf(perf, perf_value, p, "SLFad     ");
+  }
+
+  // Run DFad
+  if (dfad) {
+    Perf perf =
+      do_time_fad<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,p,nloop,check);
+    print_perf(perf, perf_value, p, "DFad      ");
+  }
+
+  // Run hierarchical
+  if (hierarchical) {
+    if (sfad && ph == HierSFadSize) {
+      Perf perf =
+        do_time_fad_hierarchical<Sacado::Fad::SFad<double,HierSFadSize>, ViewArgs...>(m,n,ph,nloop,check);
+      print_perf(perf, perf_value, ph, "Hier SFad ");
+    }
+    if (slfad && ph <= HierSLFadSize) {
+      Perf perf =
+        do_time_fad_hierarchical<Sacado::Fad::SLFad<double,HierSLFadSize>, ViewArgs...>(m,n,ph,nloop,check);
+      print_perf(perf, perf_value, ph, "Hier SLFad");
+    }
+    if (dfad) {
+      Perf perf =
+        do_time_fad_hierarchical<Sacado::Fad::DFad<double>, ViewArgs...>(m,n,ph,nloop,check);
+      print_perf(perf, perf_value, ph, "Hier DFad ");
+    }
+  }
+
+}
+
+enum LayoutType {
+  LAYOUT_LEFT=0,
+  LAYOUT_RIGHT,
+  LAYOUT_DEFAULT
+};
+const int num_layout_types = 3;
+const LayoutType layout_values[] = {
+  LAYOUT_LEFT, LAYOUT_RIGHT, LAYOUT_DEFAULT };
+const char *layout_names[] = { "left", "right", "default" };
+
+template <int SFadSize, int SLFadSize, int HierSFadSize, int HierSLFadSize,
+          typename Device>
+void
+do_times_layout(const size_t m,
+                const size_t n,
+                const size_t p,
+                const size_t ph,
+                const size_t nloop,
+                const bool value,
+                const bool sfad,
+                const bool slfad,
+                const bool dfad,
+                const bool hierarchical,
+                const bool check,
+                const LayoutType& layout,
+                const std::string& device)
+{
+  int prec = 2;
+  std::cout.setf(std::ios::scientific);
+  std::cout.precision(prec);
+  std::cout << std::endl
+            << device
+            << " performance for layout "
+            << layout_names[layout]
+            << " m = " << m << " n = " << n << " p = " << p << " ph = " << ph
+            << std::endl << std::endl;
+  std::cout << "Computation \t Time     \t Throughput \t Ratio" << std::endl;
+
+  if (layout == LAYOUT_LEFT)
+    do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutLeft,Device>(
+      m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
+  else if (layout == LAYOUT_RIGHT)
+    do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::LayoutRight,Device>(
+      m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
+  else
+    do_times<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Device>
+      (m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check);
+}
+
+// Connect executable to vtune for profiling
+void connect_vtune() {
+  std::stringstream cmd;
+  pid_t my_os_pid=getpid();
+  const std::string vtune_loc =
+    "amplxe-cl";
+  const std::string output_dir = "./vtune";
+  cmd << vtune_loc
+      << " -collect hotspots -result-dir " << output_dir
+      << " -target-pid " << my_os_pid << " &";
+  std::cout << cmd.str() << std::endl;
+  system(cmd.str().c_str());
+  system("sleep 10");
+}
+
+//const int SFadSize  = 8;
+const int SFadSize  = 32;
+const int SLFadSize = SFadSize;
+//const int HierSFadSize  = 50;
+const int HierSFadSize  = 32;
+const int HierSLFadSize = HierSFadSize;
+
+int main(int argc, char* argv[]) {
+  bool success = true;
+  try {
+
+    // Set up command line options
+    Teuchos::CommandLineProcessor clp(false);
+    clp.setDocString("This program tests the speed of various forward mode AD implementations for simple Kokkos kernel");
+    int m = 100000;
+    clp.setOption("m", &m, "Number of matrix rows");
+    int n = 100;
+    clp.setOption("n", &n, "Number of matrix columns");
+    int p = SFadSize;
+    clp.setOption("p", &p, "Number of derivative components");
+    int ph = HierSFadSize;
+    clp.setOption("ph", &ph, "Number of derivative components for hierarchical");
+    int nloop = 10;
+    clp.setOption("nloop", &nloop, "Number of loops");
+#ifdef KOKKOS_ENABLE_SERIAL
+    bool serial = 0;
+    clp.setOption("serial", "no-serial", &serial, "Whether to run Serial");
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+    int openmp = 0;
+    clp.setOption("openmp", &openmp, "Number of OpenMP threads");
+#endif
+#ifdef KOKKOS_ENABLE_THREADS
+    int threads = 0;
+    clp.setOption("threads", &threads, "Number of pThreads threads");
+#endif
+#ifdef KOKKOS_ENABLE_CUDA
+    bool cuda = 0;
+    clp.setOption("cuda", "no-cuda", &cuda, "Whether to run CUDA");
+#endif
+    int numa = 0;
+    clp.setOption("numa", &numa,
+                  "Number of NUMA domains to use (set to 0 to use all NUMAs");
+    int cores_per_numa = 0;
+    clp.setOption("cores-per-numa", &cores_per_numa,
+                  "Number of CPU cores per NUMA to use (set to 0 to use all cores)");
+    bool print_config = false;
+    clp.setOption("print-config", "no-print-config", &print_config,
+                  "Whether to print Kokkos device configuration");
+    LayoutType layout = LAYOUT_DEFAULT;
+    clp.setOption("layout", &layout, num_layout_types, layout_values,
+                  layout_names, "View layout");
+    bool vtune = false;
+    clp.setOption("vtune", "no-vtune", &vtune, "Profile with vtune");
+    bool value = true;
+    clp.setOption("value", "no-value", &value, "Run value calculation");
+    bool sfad = true;
+    clp.setOption("sfad", "no-sfad", &sfad, "Run SFad derivative calculation");
+    bool slfad = true;
+    clp.setOption("slfad", "no-slfad", &slfad, "Run SLFad derivative calculation");
+    bool dfad = true;
+    clp.setOption("dfad", "no-dfad", &dfad, "Run DFad derivative calculation");
+    bool hierarchical = true;
+    clp.setOption("hierarchical", "no-hierarchical", &hierarchical, "Run hierarchical Fad derivative calculation");
+    bool check = false;
+    clp.setOption("check", "no-check", &check, "Check calculations are correct");
+
+    // Parse options
+    switch (clp.parse(argc, argv)) {
+      case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:
+        return 0;
+      case Teuchos::CommandLineProcessor::PARSE_ERROR:
+      case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION:
+        return 1;
+      case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:
+        break;
+    }
+
+    if (vtune)
+      connect_vtune();
+
+    Kokkos::InitArguments init_args;
+    init_args.num_threads = cores_per_numa;
+    init_args.num_numa = numa;
+
+    Kokkos::initialize(init_args);
+
+    if (print_config)
+      Kokkos::print_configuration(std::cout, true);
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if (serial) {
+      do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Serial>(
+        m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Serial");
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+    if (openmp) {
+      do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::OpenMP>(
+        m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"OpenMP");
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+    if (threads) {
+      do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Threads>(
+        m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Threads");
+    }
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+    if (cuda) {
+      do_times_layout<SFadSize,SLFadSize,HierSFadSize,HierSLFadSize,Kokkos::Cuda>(
+        m,n,p,ph,nloop,value,sfad,slfad,dfad,hierarchical,check,layout,"Cuda");
+    }
+#endif
+
+    Kokkos::finalize();
+
+  }
+  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success);
+
+  return !success;
+}
diff --git a/packages/sacado/test/performance/mat_vec/mat_vec.cpp b/packages/sacado/test/performance/mat_vec/mat_vec.cpp
new file mode 100644
index 000000000000..3138d758bd9d
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/mat_vec.cpp
@@ -0,0 +1,552 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#include "Sacado.hpp"
+
+#include "mat_vec.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void run_mat_vec(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  Kokkos::parallel_for(
+    Kokkos::RangePolicy<execution_space>( 0,m ),
+    KOKKOS_LAMBDA (const int i) {
+      scalar_type t = 0.0;
+      for (int j=0; j<n; ++j)
+        t += A(i,j)*b(j);
+      c(i) = t;
+    }
+  );
+}
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+run_mat_vec_scratch(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> Policy;
+  typedef typename Policy::member_type team_member;
+  typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int p = dimension_scalar(A);
+
+#ifdef KOKKOS_ENABLE_CUDA
+  const bool is_cuda = std::is_same<execution_space,Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+  const int TeamSize = is_cuda ? 128 : 1;
+  const int N = (m+TeamSize-1)/TeamSize;
+  Policy policy(N, TeamSize, 1);
+  const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
+  Kokkos::parallel_for(
+    policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
+    KOKKOS_LAMBDA (const team_member& team) {
+      const int team_rank = team.team_rank();
+      const int team_size = team.team_size();
+      TmpScratchSpace t(team.team_scratch(0), team_size, p);
+      const int i = team.league_rank()*team_size + team_rank;
+      if (i < m) {
+        t(team_rank) = 0.0;
+        for (int j=0; j<n; ++j)
+          t(team_rank) += A(i,j)*b(j);
+        c(i) = t(team_rank);
+      }
+    }
+  );
+}
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int p = A.extent(2)-1;
+  Kokkos::parallel_for(
+    Kokkos::RangePolicy<execution_space>( 0,m ),
+    KOKKOS_LAMBDA (const int i) {
+      c(i,p) = 0.0;
+      for (int k=0; k<p; ++k)
+        c(i,k) = 0.0;
+      for (int j=0; j<n; ++j) {
+        c(i,p) += A(i,j,p)*b(j,p);
+        for (int k=0; k<p; ++k) {
+          c(i,k) += A(i,j,k)*b(j,p) + A(i,j,p)*b(j,k);
+        }
+      }
+    }
+  );
+}
+
+template <int MaxP, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+run_mat_vec_deriv_sl(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int p = A.extent(2)-1;
+  Kokkos::parallel_for(
+    Kokkos::RangePolicy<execution_space>( 0,m ),
+    KOKKOS_LAMBDA (const int i) {
+      scalar_type cv = 0.0;
+      scalar_type t[MaxP];
+      for (int k=0; k<p; ++k)
+        t[k] = 0.0;
+
+      for (int j=0; j<n; ++j) {
+        scalar_type av = A(i,j,p);
+        scalar_type bv = b(j,p);
+        cv += av*bv;
+        for (int k=0; k<p; ++k) {
+          t[k] += A(i,j,k)*bv + av*b(j,k);
+        }
+      }
+
+      for (int k=0; k<p; ++k)
+        c(i,k) = t[k];
+      c(i,p) = cv;
+    }
+  );
+}
+
+template <int p, typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+run_mat_vec_deriv_s(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  Kokkos::parallel_for(
+    Kokkos::RangePolicy<execution_space>( 0,m ),
+    KOKKOS_LAMBDA (const int i) {
+      scalar_type cv = 0.0;
+      scalar_type t[p];
+      for (int k=0; k<p; ++k)
+        t[k] = 0.0;
+
+      for (int j=0; j<n; ++j) {
+        const scalar_type av = A(i,j,p);
+        const scalar_type bv = b(j,p);
+        cv += av*bv;
+
+// Using simd here results in much better performance.  Othewise the compiler
+// appears to try and vectorize the j loop with gather instructions, which
+// doesn't work very well.
+#if defined(__INTEL_COMPILER) && ! defined(__CUDA_ARCH__)
+#pragma simd
+#endif
+        for (int k=0; k<p; ++k) {
+          t[k] += A(i,j,k)*bv + av*b(j,k);
+        }
+      }
+
+      for (int k=0; k<p; ++k)
+        c(i,k) = t[k];
+      c(i,p) = cv;
+    }
+  );
+}
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+check_val(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  const double tol = 1.0e-14;
+  typedef typename ViewTypeC::value_type value_type;
+  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
+  Kokkos::deep_copy(h_c, c);
+  const size_t m = A.extent(0);
+  const size_t n = A.extent(1);
+  for (size_t i=0; i<m; ++i) {
+    value_type t = n;
+    if (std::abs(h_c(i)- t) > tol) {
+      std::cout << "Comparison failed!  " << i << " : " << h_c(i) << " , " << t
+                << std::endl;
+    }
+  }
+}
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+check_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  const double tol = 1.0e-14;
+  typedef typename ViewTypeC::value_type value_type;
+  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
+  Kokkos::deep_copy(h_c, c);
+  const size_t m = A.extent(0);
+  const size_t n = A.extent(1);
+  const size_t p = A.extent(2);
+  for (size_t i=0; i<m; ++i) {
+    for (size_t j=0; j<p; ++j) {
+      value_type t = (j == p-1 ? n : 2*n);
+      if (std::abs(h_c(i,j)- t) > tol) {
+        std::cout << "Comparison failed!  " << i << "," << j << " : "
+                  << h_c(i,j) << " , " << t << std::endl;
+      }
+    }
+  }
+}
+
+template <typename ... ViewArgs>
+Perf
+do_time_val(const size_t m, const size_t n, const size_t nloop,
+            const bool check)
+{
+  typedef Kokkos::View<double**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<double*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<double*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+  ViewTypeA A("A",m,n);
+  ViewTypeB b("B",n);
+  ViewTypeC c("c",m);
+
+  Kokkos::deep_copy(A, 1.0);
+  Kokkos::deep_copy(b, 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*2;
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check)
+    check_val(A,b,c);
+
+  return perf;
+}
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
+            const bool check)
+{
+  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+  // Set amount of memory available for dynamic memory allocation on GPU
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<execution_space,Kokkos::Cuda>::value &&
+      std::is_same<FadType,Sacado::Fad::DFad<double> >::value) {
+    const size_t concurrency = execution_space::concurrency();
+    const size_t mem = std::min(m,concurrency) * p * sizeof(double);
+    //std::cout << "mem = " << mem / (1024*1024) << " MB" << std::endl;
+    cudaDeviceSetLimit(cudaLimitMallocHeapSize, mem);
+  }
+#endif
+
+  ViewTypeA A("A",m,n,p+1);
+  ViewTypeB b("B",n,p+1);
+  ViewTypeC c("c",m,p+1);
+
+  // FadType a(p, 1.0);
+  // for (size_t k=0; k<p; ++k)
+  //   a.fastAccessDx(k) = 1.0;
+  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
+  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check) {
+    typename ViewTypeA::array_type A_flat = A;
+    typename ViewTypeB::array_type b_flat = b;
+    typename ViewTypeC::array_type c_flat = c;
+    check_deriv(A_flat, b_flat, c_flat);
+  }
+
+  return perf;
+}
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop,
+                const bool check)
+{
+  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+  ViewTypeA A("A",m,n,p+1);
+  ViewTypeB b("B",n,p+1);
+  ViewTypeC c("c",m,p+1);
+
+  // FadType a(p, 1.0);
+  // for (size_t k=0; k<p; ++k)
+  //   a.fastAccessDx(k) = 1.0;
+  Kokkos::deep_copy(typename ViewTypeA::array_type(A), 1.0);
+  Kokkos::deep_copy(typename ViewTypeB::array_type(b), 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec_scratch( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec_scratch( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check) {
+    typename ViewTypeA::array_type A_flat = A;
+    typename ViewTypeB::array_type b_flat = b;
+    typename ViewTypeC::array_type c_flat = c;
+    check_deriv(A_flat, b_flat, c_flat);
+  }
+
+  return perf;
+}
+
+template <typename ... ViewArgs>
+Perf
+do_time_analytic(const size_t m, const size_t n, const size_t p,
+                 const size_t nloop, const bool check)
+{
+  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<double**,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<double**,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+  ViewTypeA A("A",m,n,p+1);
+  ViewTypeB b("B",n,p+1);
+  ViewTypeC c("c",m,p+1);
+
+  Kokkos::deep_copy(A, 1.0);
+  Kokkos::deep_copy(b, 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec_deriv( A, b, c );
+  execution_space().fence();
+
+  Teuchos::Time timer("mult", false);
+  timer.start(true);
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec_deriv( A, b, c );
+  }
+  execution_space().fence();
+  timer.stop();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check)
+    check_deriv(A,b,c);
+
+  return perf;
+}
+
+template <int MaxP, typename ... ViewArgs>
+Perf
+do_time_analytic_sl(const size_t m, const size_t n, const size_t p,
+                    const size_t nloop, const bool check)
+{
+  typedef Kokkos::View<double***, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<double**,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<double**,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+  ViewTypeA A("A",m,n,p+1);
+  ViewTypeB b("B",n,p+1);
+  ViewTypeC c("c",m,p+1);
+
+  Kokkos::deep_copy(A, 1.0);
+  Kokkos::deep_copy(b, 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec_deriv_sl<MaxP>( A, b, c );
+  execution_space().fence();
+
+  Teuchos::Time timer("mult", false);
+  timer.start(true);
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec_deriv_sl<MaxP>( A, b, c );
+  }
+  execution_space().fence();
+  timer.stop();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check)
+    check_deriv(A,b,c);
+
+  return perf;
+}
+
+template <int p, typename ... ViewArgs>
+Perf
+do_time_analytic_s(const size_t m, const size_t n,
+                   const size_t nloop, const bool check)
+{
+  typedef Kokkos::View<double**[p+1], ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<double**,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<double**,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+  ViewTypeA A("A",m,n);
+  ViewTypeB b("B",n,p+1);
+  ViewTypeC c("c",m,p+1);
+
+  Kokkos::deep_copy(A, 1.0);
+  Kokkos::deep_copy(b, 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec_deriv_s<p>( A, b, c );
+  execution_space().fence();
+
+  Teuchos::Time timer("mult", false);
+  timer.start(true);
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec_deriv_s<p>( A, b, c );
+  }
+  execution_space().fence();
+  timer.stop();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check)
+    check_deriv(A,b,c);
+
+  return perf;
+}
+
+typedef Sacado::Fad::SFad<double,SFadSize> SFad_type;
+typedef Sacado::Fad::SLFad<double,SLFadSize> SLFad_type;
+typedef Sacado::Fad::DFad<double> DFad_type;
+
+#define INST_FUNC_VAL_DEV(DEV) \
+  template Perf do_time_val< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
+  template Perf do_time_val< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
+  template Perf do_time_val< DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check ); \
+  template Perf do_time_analytic< Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
+  template Perf do_time_analytic< Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
+  template Perf do_time_analytic< DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
+  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
+  template Perf do_time_analytic_sl< SLFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
+  template Perf do_time_analytic_sl< SLFadSize, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check); \
+  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
+  template Perf do_time_analytic_s< SFadSize, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check); \
+  template Perf do_time_analytic_s< SFadSize, DEV > ( const size_t m, const size_t n, const size_t nloop, const bool check);
+
+#define INST_FUNC_FAD_DEV(FAD,DEV)                                      \
+  template Perf do_time_fad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
+
+#define INST_FUNC_DEV(DEV)                                       \
+  INST_FUNC_VAL_DEV( DEV )                                       \
+  INST_FUNC_FAD_DEV( SFad_type, DEV )   \
+  INST_FUNC_FAD_DEV( SLFad_type, DEV ) \
+  INST_FUNC_FAD_DEV( DFad_type, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/mat_vec/mat_vec.hpp b/packages/sacado/test/performance/mat_vec/mat_vec.hpp
new file mode 100644
index 000000000000..d5d86d2ea21c
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/mat_vec.hpp
@@ -0,0 +1,62 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+#include "common.hpp"
+
+template <typename ... ViewArgs>
+Perf
+do_time_val(const size_t m, const size_t n, const size_t nloop,
+            const bool check);
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad(const size_t m, const size_t n, const size_t p, const size_t nloop,
+            const bool check);
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_scratch(const size_t m, const size_t n, const size_t p,
+                const size_t nloop, const bool check);
+
+template <typename ... ViewArgs>
+Perf
+do_time_analytic(const size_t m, const size_t n, const size_t p,
+                 const size_t nloop, const bool check);
+
+template <int MaxP, typename ... ViewArgs>
+Perf
+do_time_analytic_sl(const size_t m, const size_t n, const size_t p,
+                    const size_t nloop, const bool check);
+
+template <int p, typename ... ViewArgs>
+Perf
+do_time_analytic_s(const size_t m, const size_t n,
+                   const size_t nloop, const bool check);
diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.cpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.cpp
new file mode 100644
index 000000000000..9a6f0783096f
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.cpp
@@ -0,0 +1,190 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#define SACADO_VIEW_CUDA_HIERARCHICAL 1
+#define SACADO_ALIGN_SFAD 1
+
+#include "Sacado.hpp"
+
+#include "mat_vec_hierarchical.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void run_mat_vec_hierarchical(const ViewTypeA& A, const ViewTypeB& b,
+                              const ViewTypeC& c) {
+  typedef typename Kokkos::ThreadLocalScalarType<ViewTypeC>::type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+  const unsigned vector_size = is_cuda ? 32 : 1;
+  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int range = (m+team_size-1)/team_size;
+
+  typedef Kokkos::TeamPolicy<execution_space> Policy;
+  Kokkos::parallel_for(
+    Policy( range,team_size,vector_size ),
+    KOKKOS_LAMBDA (const typename Policy::member_type& team) {
+      const int i = team.league_rank()*team.team_size() + team.team_rank();
+      if (i >= m)
+        return;
+
+      scalar_type t = 0.0;
+      for (int j=0; j<n; ++j)
+        t += A(i,j)*b(j);
+      c(i) = t;
+    }
+  );
+}
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+check_deriv_hierarchical(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  const double tol = 1.0e-14;
+  typedef typename ViewTypeC::value_type value_type;
+  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
+  Kokkos::deep_copy(h_c, c);
+  const size_t m = A.extent(0);
+  const size_t n = A.extent(1);
+  const size_t p = Kokkos::dimension_scalar(A);
+  for (size_t i=0; i<m; ++i) {
+    for (size_t j=0; j<p; ++j) {
+      value_type t = (j == p-1 ? n : 2*n);
+      if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
+        std::cout << "Comparison failed!  " << i << "," << j << " : "
+                  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
+      }
+    }
+  }
+}
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
+                         const size_t nloop, const bool check)
+{
+  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+
+  const int FadStride = is_cuda ? 32 : 1;
+#if defined(SACADO_ALIGN_SFAD)
+  const int N = Sacado::StaticSize<FadType>::value;
+  const int Nalign = ((N+FadStride-1)/FadStride)*FadStride;
+  const size_t pa = N > 0 ? ((p+FadStride-1)/FadStride)*FadStride : p;
+  typedef typename FadType::template apply_N<Nalign>::type AlignedFadType;
+#else
+  typedef FadType AlignedFadType;
+  const size_t pa = p;
+#endif
+
+  typedef Kokkos::LayoutContiguous<typename ViewTypeA::array_layout,FadStride> ConLayoutA;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeB::array_layout,FadStride> ConLayoutB;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeC::array_layout,FadStride> ConLayoutC;
+
+  typedef Kokkos::View<AlignedFadType**, ConLayoutA, execution_space> ConViewTypeA;
+  typedef Kokkos::View<AlignedFadType*,  ConLayoutB, execution_space> ConViewTypeB;
+  typedef Kokkos::View<AlignedFadType*,  ConLayoutC, execution_space> ConViewTypeC;
+
+  ConViewTypeA A("A",m,n,pa+1);
+  ConViewTypeB b("B",n,pa+1);
+  ConViewTypeC c("c",m,pa+1);
+
+  // AlignedFadType a(pa, 1.0);
+  // for (size_t k=0; k<pa; ++k)
+  //   a.fastAccessDx(k) = 1.0;
+  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
+  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec_hierarchical( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec_hierarchical( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check) {
+    check_deriv_hierarchical(A, b, c);
+  }
+
+  return perf;
+}
+
+typedef Sacado::Fad::SFad<double,HierSFadSize> SFad_type;
+typedef Sacado::Fad::SLFad<double,HierSLFadSize> SLFad_type;
+
+#define INST_FUNC_FAD_DEV(FAD,DEV)                                      \
+  template Perf do_time_fad_hierarchical< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad_hierarchical< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad_hierarchical< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
+
+#define INST_FUNC_DEV(DEV)              \
+  INST_FUNC_FAD_DEV( SFad_type, DEV )   \
+  INST_FUNC_FAD_DEV( SLFad_type, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.hpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.hpp
new file mode 100644
index 000000000000..03d89ab62b68
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical.hpp
@@ -0,0 +1,37 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+#include "common.hpp"
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad_hierarchical(const size_t m, const size_t n, const size_t p,
+                         const size_t nloop, const bool check);
diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp
new file mode 100644
index 000000000000..d41f91cd5399
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp
@@ -0,0 +1,285 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
+#define SACADO_KOKKOS_USE_MEMORY_POOL 1
+
+#include "Sacado.hpp"
+
+#include "mat_vec_hierarchical_dfad.hpp"
+
+#include "impl/Kokkos_Timer.hpp"
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void run_mat_vec_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b,
+                                   const ViewTypeC& c) {
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+  const unsigned vector_size = is_cuda ? 32 : 1;
+  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int range = (m+team_size-1)/team_size;
+
+  typedef Kokkos::TeamPolicy<execution_space> Policy;
+  Kokkos::parallel_for(
+    Policy( range,team_size,vector_size ),
+    KOKKOS_LAMBDA (const typename Policy::member_type& team) {
+      const int i = team.league_rank()*team.team_size() + team.team_rank();
+      if (i >= m)
+        return;
+
+      scalar_type t = 0.0;
+      for (int j=0; j<n; ++j)
+        t += A(i,j)*b(j);
+      c(i) = t;
+    }
+  );
+}
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void run_mat_vec_hierarchical_dfad_scratch(
+  const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
+  typedef typename ViewTypeC::value_type scalar_type;
+  typedef typename ViewTypeC::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> Policy;
+  typedef typename Policy::member_type team_member;
+  typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+  const unsigned VectorSize = is_cuda ? 32 : 1;
+  const unsigned TeamSize = is_cuda ? 128 / VectorSize : 1;
+
+  const int m = A.extent(0);
+  const int n = A.extent(1);
+  const int p = dimension_scalar(A);
+  const int N = (m+TeamSize-1)/TeamSize;
+
+  Policy policy(N, TeamSize, VectorSize);
+  const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
+  Kokkos::parallel_for(
+    policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
+    KOKKOS_LAMBDA (const typename Policy::member_type& team) {
+      const int team_rank = team.team_rank();
+      const int team_size = team.team_size();
+      TmpScratchSpace t(team.team_scratch(0), team_size, p);
+      const int i = team.league_rank()*team_size + team_rank;
+      if (i < m) {
+        t(team_rank) = 0.0;
+        for (int j=0; j<n; ++j)
+          t(team_rank) += A(i,j)*b(j);
+        c(i) = t(team_rank);
+      }
+    }
+  );
+}
+
+template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
+void
+check_deriv_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
+{
+  const double tol = 1.0e-14;
+  typedef typename ViewTypeC::value_type value_type;
+  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
+  Kokkos::deep_copy(h_c, c);
+  const size_t m = A.extent(0);
+  const size_t n = A.extent(1);
+  const size_t p = Kokkos::dimension_scalar(A);
+  for (size_t i=0; i<m; ++i) {
+    for (size_t j=0; j<p; ++j) {
+      value_type t = (j == p-1 ? n : 2*n);
+      if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
+        std::cout << "Comparison failed!  " << i << "," << j << " : "
+                  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
+      }
+    }
+  }
+}
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p,
+                              const size_t nloop, const bool check)
+{
+  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeA::array_layout> ConLayoutA;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeB::array_layout> ConLayoutB;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeC::array_layout> ConLayoutC;
+  typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
+  typedef Kokkos::View<FadType*,  ConLayoutB, execution_space> ConViewTypeB;
+  typedef Kokkos::View<FadType*,  ConLayoutC, execution_space> ConViewTypeC;
+
+  ConViewTypeA A("A",m,n,p+1);
+  ConViewTypeB b("B",n,p+1);
+  ConViewTypeC c("c",m,p+1);
+
+  // FadType a(p, 1.0);
+  // for (size_t k=0; k<p; ++k)
+  //   a.fastAccessDx(k) = 1.0;
+  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
+  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+#if defined (KOKKOS_ENABLE_CUDA)
+  const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
+#else
+  const bool is_cuda = false;
+#endif
+  const size_t concurrency = execution_space::concurrency();
+  const size_t warp_dim = is_cuda ? 32 : 1;
+  const size_t block_size = p*sizeof(double);
+  const size_t nkernels = concurrency / warp_dim;
+  const size_t mem_pool_size =
+    static_cast<size_t>(1.2*nkernels*block_size);
+  const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
+  execution_space space;
+  Sacado::createGlobalMemoryPool(space, mem_pool_size,
+      block_size,
+      block_size,
+      superblock_size
+      );
+
+  // Execute the kernel once to warm up
+  run_mat_vec_hierarchical_dfad( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec_hierarchical_dfad( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check) {
+    check_deriv_hierarchical_dfad(A, b, c);
+  }
+
+  Sacado::destroyGlobalMemoryPool(space);
+
+  return perf;
+}
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad_hierarchical_dfad_scratch(
+  const size_t m, const size_t n, const size_t p, const size_t nloop,
+  const bool check)
+{
+  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeB;
+  typedef Kokkos::View<FadType*,  ViewArgs...> ViewTypeC;
+  typedef typename ViewTypeA::execution_space execution_space;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeA::array_layout> ConLayoutA;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeB::array_layout> ConLayoutB;
+  typedef Kokkos::LayoutContiguous<typename ViewTypeC::array_layout> ConLayoutC;
+  typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
+  typedef Kokkos::View<FadType*,  ConLayoutB, execution_space> ConViewTypeB;
+  typedef Kokkos::View<FadType*,  ConLayoutC, execution_space> ConViewTypeC;
+
+  ConViewTypeA A("A",m,n,p+1);
+  ConViewTypeB b("B",n,p+1);
+  ConViewTypeC c("c",m,p+1);
+
+  // FadType a(p, 1.0);
+  // for (size_t k=0; k<p; ++k)
+  //   a.fastAccessDx(k) = 1.0;
+  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
+  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
+
+  Kokkos::Impl::Timer wall_clock;
+  Perf perf;
+
+  // Execute the kernel once to warm up
+  run_mat_vec_hierarchical_dfad_scratch( A, b, c );
+  execution_space().fence();
+
+  wall_clock.reset();
+  for (size_t l=0; l<nloop; l++) {
+    run_mat_vec_hierarchical_dfad_scratch( A, b, c );
+  }
+  execution_space().fence();
+
+  perf.time = wall_clock.seconds() / nloop;
+  perf.flops = m*n*(2+4*p);
+  perf.throughput = perf.flops / perf.time / 1.0e9;
+
+  if (check) {
+    check_deriv_hierarchical_dfad(A, b, c);
+  }
+
+  return perf;
+}
+
+typedef Sacado::Fad::DFad<double> DFad_type;
+
+#define INST_FUNC_FAD_DEV(FAD,DEV)                                      \
+  template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad_hierarchical_dfad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
+  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
+
+#define INST_FUNC_DEV(DEV) \
+  INST_FUNC_FAD_DEV( DFad_type, DEV )
+
+#ifdef KOKKOS_ENABLE_SERIAL
+INST_FUNC_DEV(Kokkos::Serial)
+#endif
+
+#ifdef KOKKOS_ENABLE_OPENMP
+INST_FUNC_DEV(Kokkos::OpenMP)
+#endif
+
+#ifdef KOKKOS_ENABLE_THREADS
+INST_FUNC_DEV(Kokkos::Threads)
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+INST_FUNC_DEV(Kokkos::Cuda)
+#endif
diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.hpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.hpp
new file mode 100644
index 000000000000..f5e6b1c89966
--- /dev/null
+++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.hpp
@@ -0,0 +1,43 @@
+// @HEADER
+// ***********************************************************************
+//
+//                           Sacado Package
+//                 Copyright (2006) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// This library is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; either version 2.1 of the
+// License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+// USA
+// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
+// (etphipp@sandia.gov).
+//
+// ***********************************************************************
+// @HEADER
+
+#pragma once
+
+#include "common.hpp"
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p,
+                              const size_t nloop, const bool check);
+
+template <typename FadType, typename ... ViewArgs>
+Perf
+do_time_fad_hierarchical_dfad_scratch(
+  const size_t m, const size_t n, const size_t p, const size_t nloop,
+  const bool check);

From 9cfc49055119b7ef470af6ed192ebee409d31dd8 Mon Sep 17 00:00:00 2001
From: Eric Phipps <etphipp@sandia.gov>
Date: Fri, 15 May 2020 14:09:13 -0600
Subject: [PATCH 5/8] Sacado:  Add specializations for ScalarValue, Value for
 const types.

---
 packages/sacado/src/Sacado_Traits.hpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/packages/sacado/src/Sacado_Traits.hpp b/packages/sacado/src/Sacado_Traits.hpp
index 23ae54e1fe43..c1d4fb259655 100644
--- a/packages/sacado/src/Sacado_Traits.hpp
+++ b/packages/sacado/src/Sacado_Traits.hpp
@@ -365,6 +365,15 @@ namespace Sacado {
     static const T& eval(const T& x) { return x; }
   };
 
+  //! Specialization of Value for const types
+  template <typename T> struct Value<const T> {
+    typedef typename ValueType<T>::type value_type;
+    KOKKOS_INLINE_FUNCTION
+    static const value_type& eval(const T& x) {
+      return Value<T>::eval(x);
+    }
+  };
+
   //! Base template specification for %ScalarValue
   /*!
    * The %ScalarValue functor returns the base scalar value of an AD type,
@@ -375,6 +384,15 @@ namespace Sacado {
     static const T& eval(const T& x) { return x; }
   };
 
+  //! Specialization of ScalarValue for const types
+  template <typename T> struct ScalarValue<const T> {
+    typedef typename ScalarType<T>::type scalar_type;
+    KOKKOS_INLINE_FUNCTION
+    static const scalar_type& eval(const T& x) {
+      return ScalarValue<T>::eval(x);
+    }
+  };
+
   //! Base template specification for marking constants
   template <typename T> struct MarkConstant {
     KOKKOS_INLINE_FUNCTION

From 92a88916a9d64448d9d2f97703223548fa660f7d Mon Sep 17 00:00:00 2001
From: "K. Devine" <kddevin@sandia.gov>
Date: Fri, 15 May 2020 15:04:15 -0600
Subject: [PATCH 6/8] tpetra:  fixed two picky compiler errors (signed vs
 unsigned)

---
 .../core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp     | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp
index bddeabc16c1f..b8320f2ab972 100644
--- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp
+++ b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_2DRandomDist.cpp
@@ -120,12 +120,12 @@ class generatedNonzeros
     int me = comm->getRank();
 
     // Precompute values needed for distribution 1 (linear row-wise)
-    gno_t nMyRows = nRows / np + (nRows % np > me);
+    gno_t nMyRows = nRows / np + (int(nRows % np) > me);
     gno_t myFirstRow = (me * (nRows / np) + std::min<gno_t>(nRows % np, me));
     gno_t myLastRow = myFirstRow + nMyRows - 1;
 
     // Precompute values needed for distribution 2 (linear column-wise)
-    gno_t nMyCols = nCols / np + (nCols % np > me);
+    gno_t nMyCols = nCols / np + (int(nCols % np) > me);
     gno_t myFirstCol = (me * (nCols / np) + std::min<gno_t>(nCols % np, me));
     gno_t myLastCol = myFirstCol + nMyCols - 1;
 
@@ -173,7 +173,8 @@ class generatedNonzeros
     // Compute prefix sum in offsets array
     offsets.resize(rowIdx.size() + 1);
     offsets[0] = 0;
-    for (size_t row = 0; row < rowIdx.size(); row++)
+    size_t nRowIdx = size_t(rowIdx.size());
+    for (size_t row = 0; row < nRowIdx; row++)
       offsets[row+1] = offsets[row] + nPerRow[row];
   }
 
@@ -208,7 +209,8 @@ class generatedNonzeros
 
     Teuchos::RCP<matrix_t> Amat = Teuchos::rcp(new matrix_t(rowMap, nPerRow()));
   
-    for (size_t r = 0; r < rowIdx.size(); r++) {
+    size_t nRowIdx = size_t(rowIdx.size());
+    for (size_t r = 0; r < nRowIdx; r++) {
       size_t tmp = offsets[r+1] - offsets[r];
       Amat->insertGlobalValues(rowIdx[r], 
                                colIdx(offsets[r],tmp), val(offsets[r],tmp));

From 5d606fe926622bc2f1d174fe94956d99cee125b6 Mon Sep 17 00:00:00 2001
From: Eric Phipps <etphipp@sandia.gov>
Date: Wed, 20 May 2020 14:39:59 -0600
Subject: [PATCH 7/8] Sacado:  The dreaded "const-lambda-capture" gcc compiler
 bug strikes again!

Also fix some unused typedef warnings.
---
 packages/sacado/test/performance/advection/advection.cpp      | 4 ++--
 .../test/performance/advection/advection_hierarchical.cpp     | 2 --
 packages/sacado/test/performance/advection/common.hpp         | 2 --
 .../test/performance/advection_const_basis/advection.cpp      | 4 ++--
 .../advection_const_basis/advection_hierarchical.cpp          | 2 --
 .../sacado/test/performance/advection_const_basis/common.hpp  | 2 --
 packages/sacado/test/performance/mat_vec/mat_vec.cpp          | 1 -
 .../test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp    | 2 +-
 8 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/packages/sacado/test/performance/advection/advection.cpp b/packages/sacado/test/performance/advection/advection.cpp
index 8b4a803aacda..3446b4254722 100644
--- a/packages/sacado/test/performance/advection/advection.cpp
+++ b/packages/sacado/test/performance/advection/advection.cpp
@@ -168,8 +168,8 @@ void run_analytic_team(const FluxView& flux, const WgbView& wgb,
 
   const size_t num_cells = wgb.extent(0);
   const int num_basis    = wgb.extent(1);
-  const int num_points   = wgb.extent(2);
-  const int num_dim      = wgb.extent(3);
+  /*const*/ int num_points   = wgb.extent(2);
+  /*const*/ int num_dim      = wgb.extent(3);
 
   const size_t bytes     = 2*tmp_scratch_type::shmem_size();
   policy_type policy(num_cells,num_basis,32);
diff --git a/packages/sacado/test/performance/advection/advection_hierarchical.cpp b/packages/sacado/test/performance/advection/advection_hierarchical.cpp
index f6377c207dbd..c5858b5299b9 100644
--- a/packages/sacado/test/performance/advection/advection_hierarchical.cpp
+++ b/packages/sacado/test/performance/advection/advection_hierarchical.cpp
@@ -43,7 +43,6 @@ void run_fad_hierarchical_flat(const FluxView& flux, const WgbView& wgb,
                                const ResidualView& residual)
 {
   typedef typename ResidualView::execution_space execution_space;
-  typedef typename ResidualView::non_const_value_type scalar_type;
   typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
   typedef Kokkos::TeamPolicy<execution_space> policy_type;
   typedef typename policy_type::member_type team_member;
@@ -83,7 +82,6 @@ void run_fad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
                                const ResidualView& residual)
 {
   typedef typename ResidualView::execution_space execution_space;
-  typedef typename ResidualView::non_const_value_type scalar_type;
   typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
   typedef Kokkos::TeamPolicy<execution_space> policy_type;
   typedef typename policy_type::member_type team_member;
diff --git a/packages/sacado/test/performance/advection/common.hpp b/packages/sacado/test/performance/advection/common.hpp
index 3a8c76257050..2a518a0b646d 100644
--- a/packages/sacado/test/performance/advection/common.hpp
+++ b/packages/sacado/test/performance/advection/common.hpp
@@ -378,8 +378,6 @@ void check_residual(const FluxView& flux, const WgbView& wgb,
                     const SrcView& src, const WbsView& wbs,
                     const ResidualView& residual)
 {
-  typedef typename ResidualView::execution_space execution_space;
-
   // Generate gold residual
   auto residual_gold = compute_gold_residual(flux, wgb, src, wbs);
 
diff --git a/packages/sacado/test/performance/advection_const_basis/advection.cpp b/packages/sacado/test/performance/advection_const_basis/advection.cpp
index cdf84c285ca3..fb95df285c5c 100644
--- a/packages/sacado/test/performance/advection_const_basis/advection.cpp
+++ b/packages/sacado/test/performance/advection_const_basis/advection.cpp
@@ -167,8 +167,8 @@ void run_analytic_team(const FluxView& flux, const WgbView& wgb,
 
   const size_t num_cells = wgb.extent(0);
   const int num_basis    = wgb.extent(1);
-  const int num_points   = wgb.extent(2);
-  const int num_dim      = wgb.extent(3);
+  /*const*/ int num_points   = wgb.extent(2);
+  /*const*/ int num_dim      = wgb.extent(3);
 
   const size_t bytes     = 2*tmp_scratch_type::shmem_size();
   policy_type policy(num_cells,num_basis,32);
diff --git a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp
index 40aa3082d35c..e66de586f231 100644
--- a/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp
+++ b/packages/sacado/test/performance/advection_const_basis/advection_hierarchical.cpp
@@ -43,7 +43,6 @@ void run_fad_hierarchical_flat(const FluxView& flux, const WgbView& wgb,
                                const ResidualView& residual)
 {
   typedef typename ResidualView::execution_space execution_space;
-  typedef typename ResidualView::non_const_value_type scalar_type;
   typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
   typedef Kokkos::TeamPolicy<execution_space> policy_type;
   typedef typename policy_type::member_type team_member;
@@ -83,7 +82,6 @@ void run_fad_hierarchical_team(const FluxView& flux, const WgbView& wgb,
                                const ResidualView& residual)
 {
   typedef typename ResidualView::execution_space execution_space;
-  typedef typename ResidualView::non_const_value_type scalar_type;
   typedef typename Kokkos::ThreadLocalScalarType<ResidualView>::type local_scalar_type;
   typedef Kokkos::TeamPolicy<execution_space> policy_type;
   typedef typename policy_type::member_type team_member;
diff --git a/packages/sacado/test/performance/advection_const_basis/common.hpp b/packages/sacado/test/performance/advection_const_basis/common.hpp
index 4f1819ae4c06..a7616179f733 100644
--- a/packages/sacado/test/performance/advection_const_basis/common.hpp
+++ b/packages/sacado/test/performance/advection_const_basis/common.hpp
@@ -339,8 +339,6 @@ void check_residual(const FluxView& flux, const WgbView& wgb,
                     const SrcView& src, const WbsView& wbs,
                     const ResidualView& residual)
 {
-  typedef typename ResidualView::execution_space execution_space;
-
   // Generate gold residual
   auto residual_gold = compute_gold_residual(flux, wgb, src, wbs);
 
diff --git a/packages/sacado/test/performance/mat_vec/mat_vec.cpp b/packages/sacado/test/performance/mat_vec/mat_vec.cpp
index 3138d758bd9d..57153a0a9765 100644
--- a/packages/sacado/test/performance/mat_vec/mat_vec.cpp
+++ b/packages/sacado/test/performance/mat_vec/mat_vec.cpp
@@ -95,7 +95,6 @@ template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
 void
 run_mat_vec_deriv(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
 {
-  typedef typename ViewTypeC::value_type scalar_type;
   typedef typename ViewTypeC::execution_space execution_space;
 
   const int m = A.extent(0);
diff --git a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp
index d41f91cd5399..a47fd693f614 100644
--- a/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp
+++ b/packages/sacado/test/performance/mat_vec/mat_vec_hierarchical_dfad.cpp
@@ -96,7 +96,7 @@ void run_mat_vec_hierarchical_dfad_scratch(
   const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
   Kokkos::parallel_for(
     policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
-    KOKKOS_LAMBDA (const typename Policy::member_type& team) {
+    KOKKOS_LAMBDA (const team_member& team) {
       const int team_rank = team.team_rank();
       const int team_size = team.team_size();
       TmpScratchSpace t(team.team_scratch(0), team_size, p);

From 355adaf7f8b4068a42b83e4bf53e76392e86f0be Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 20 May 2020 15:02:17 -0600
Subject: [PATCH 8/8] ATDM/cee-rhel6: gnu-7.2.0_openmpi-4.0.2 ->
 gnu-7.2.0_openmpi-4.0.3 (ATDV-353)

---
 ...rhel6_gnu-7.2.0_openmpi-4.0.3_serial_shared_opt.sh} |  0
 cmake/std/atdm/cee-rhel6/all_supported_builds.sh       |  2 +-
 cmake/std/atdm/cee-rhel6/custom_builds.sh              |  8 ++++----
 cmake/std/atdm/cee-rhel6/custom_builds_unit_tests.sh   | 10 +++++-----
 cmake/std/atdm/cee-rhel6/environment.sh                |  4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)
 rename cmake/ctest/drivers/atdm/cee-rhel6/drivers/{Trilinos-atdm-cee-rhel6_gnu-7.2.0_openmpi-4.0.2_serial_shared_opt.sh => Trilinos-atdm-cee-rhel6_gnu-7.2.0_openmpi-4.0.3_serial_shared_opt.sh} (100%)

diff --git a/cmake/ctest/drivers/atdm/cee-rhel6/drivers/Trilinos-atdm-cee-rhel6_gnu-7.2.0_openmpi-4.0.2_serial_shared_opt.sh b/cmake/ctest/drivers/atdm/cee-rhel6/drivers/Trilinos-atdm-cee-rhel6_gnu-7.2.0_openmpi-4.0.3_serial_shared_opt.sh
similarity index 100%
rename from cmake/ctest/drivers/atdm/cee-rhel6/drivers/Trilinos-atdm-cee-rhel6_gnu-7.2.0_openmpi-4.0.2_serial_shared_opt.sh
rename to cmake/ctest/drivers/atdm/cee-rhel6/drivers/Trilinos-atdm-cee-rhel6_gnu-7.2.0_openmpi-4.0.3_serial_shared_opt.sh
diff --git a/cmake/std/atdm/cee-rhel6/all_supported_builds.sh b/cmake/std/atdm/cee-rhel6/all_supported_builds.sh
index 449c26b8e1d9..74b2d7738320 100644
--- a/cmake/std/atdm/cee-rhel6/all_supported_builds.sh
+++ b/cmake/std/atdm/cee-rhel6/all_supported_builds.sh
@@ -5,7 +5,7 @@ export ATDM_CONFIG_CTEST_S_BUILD_NAME_PREFIX=Trilinos-atdm-
 export ATDM_CONFIG_ALL_SUPPORTED_BUILDS=(
   #cee-rhel6_clang-9.0.1_openmpi-4.0.3_serial_static_dbg    # SPARC has installs with this build
   cee-rhel6_clang-9.0.1_openmpi-4.0.3_serial_static_opt    # SPARC CI build
-  cee-rhel6_gnu-7.2.0_openmpi-4.0.2_serial_shared_opt      # SPARC CI build
+  cee-rhel6_gnu-7.2.0_openmpi-4.0.3_serial_shared_opt      # SPARC CI build
   cee-rhel6_intel-18.0.2_mpich2-3.2_openmp_static_opt      # SPARC CI build
   cee-rhel6_intel-19.0.3_intelmpi-2018.4_serial_static_opt # SPARC Nightly bulid
   )
diff --git a/cmake/std/atdm/cee-rhel6/custom_builds.sh b/cmake/std/atdm/cee-rhel6/custom_builds.sh
index f3a9cb3d0fbb..2665c1c76f15 100644
--- a/cmake/std/atdm/cee-rhel6/custom_builds.sh
+++ b/cmake/std/atdm/cee-rhel6/custom_builds.sh
@@ -20,13 +20,13 @@ if atdm_match_any_buildname_keyword \
   # correct matching of of defaults
 
 elif atdm_match_any_buildname_keyword \
-    gnu-7.2.0-openmpi-4.0.2 \
-    gnu-7.2.0_openmpi-4.0.2 \
+    gnu-7.2.0-openmpi-4.0.3 \
+    gnu-7.2.0_openmpi-4.0.3 \
     gnu-7.2.0 \
     gnu-7 \
     gnu \
   ; then
-  export ATDM_CONFIG_COMPILER=GNU-7.2.0_OPENMPI-4.0.2
+  export ATDM_CONFIG_COMPILER=GNU-7.2.0_OPENMPI-4.0.3
   # List default "gnu"* build last of all the 'gnu' builds for correct
   # matching of defaults
 
@@ -57,7 +57,7 @@ else
   echo "*** Supported compilers include:"
   echo "***"
   echo "****  clang-9.0.1-openmpi-4.0.3      (default, default clang)"
-  echo "****  gnu-7.2.0-openmpi-4.0.2        (default gnu)"
+  echo "****  gnu-7.2.0-openmpi-4.0.3        (default gnu)"
   echo "****  intel-18.0.2-mpich2-3.2"
   echo "****  intel-19.0.3-intelmpi-2018.4   (default intel)"
   echo "***"  
diff --git a/cmake/std/atdm/cee-rhel6/custom_builds_unit_tests.sh b/cmake/std/atdm/cee-rhel6/custom_builds_unit_tests.sh
index 3ed8ead64648..4d2625229255 100755
--- a/cmake/std/atdm/cee-rhel6/custom_builds_unit_tests.sh
+++ b/cmake/std/atdm/cee-rhel6/custom_builds_unit_tests.sh
@@ -42,23 +42,23 @@ testAll() {
 
   ATDM_CONFIG_BUILD_NAME=before_gnu-7.2.0-openmpi-4.0.3-after
   . ${ATDM_CONFIG_SCRIPT_DIR}/utils/set_build_options.sh
-  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.2
+  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.3
 
   ATDM_CONFIG_BUILD_NAME=before_gnu-7.2.0_openmpi-4.0.3-after
   . ${ATDM_CONFIG_SCRIPT_DIR}/utils/set_build_options.sh
-  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.2
+  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.3
 
   ATDM_CONFIG_BUILD_NAME=before_gnu-7.2.0-after
   . ${ATDM_CONFIG_SCRIPT_DIR}/utils/set_build_options.sh
-  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.2
+  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.3
 
   ATDM_CONFIG_BUILD_NAME=before_gnu-7-after
   . ${ATDM_CONFIG_SCRIPT_DIR}/utils/set_build_options.sh
-  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.2
+  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.3
 
   ATDM_CONFIG_BUILD_NAME=before_gnu-after
   . ${ATDM_CONFIG_SCRIPT_DIR}/utils/set_build_options.sh
-  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.2
+  ${_ASSERT_EQUALS_} ${ATDM_CONFIG_COMPILER} GNU-7.2.0_OPENMPI-4.0.3
 
   # Check that 'gnus' does not match 'gnu'! (Shows true keyword matching is
   # working)
diff --git a/cmake/std/atdm/cee-rhel6/environment.sh b/cmake/std/atdm/cee-rhel6/environment.sh
index ad5f1538b796..e55066eca7a2 100755
--- a/cmake/std/atdm/cee-rhel6/environment.sh
+++ b/cmake/std/atdm/cee-rhel6/environment.sh
@@ -71,8 +71,8 @@ if  [[ "$ATDM_CONFIG_COMPILER" == "CLANG-9.0.1_OPENMPI-4.0.3" ]]; then
   fi
   export ATDM_CONFIG_MKL_ROOT=${CBLAS_ROOT}
 
-elif [[ "$ATDM_CONFIG_COMPILER" == "GNU-7.2.0_OPENMPI-4.0.2" ]] ; then
-  module load sparc-dev/gcc-7.2.0_openmpi-4.0.2
+elif [[ "$ATDM_CONFIG_COMPILER" == "GNU-7.2.0_OPENMPI-4.0.3" ]] ; then
+  module load sparc-dev/gcc-7.2.0_openmpi-4.0.3
   unset OMP_NUM_THREADS  # SPARC module sets these and we must unset!
   unset OMP_PROC_BIND
   unset OMP_PLACES