From 7d4cb4dda60b871a9c323a80652aebc5e6f02108 Mon Sep 17 00:00:00 2001
From: "Roscoe A. Bartlett" <rabartl@sandia.gov>
Date: Wed, 20 May 2020 13:03:51 -0600
Subject: [PATCH 1/8] Automatic snapshot commit from tribits at 39a9591

Origin repo remote tracking branch: 'github/master'
Origin repo remote repo URL: 'github = git@github.com:TriBITSPub/TriBITS.git'

At commit:

commit 39a959174df25f1ed41394e37c52aab2c77e542e
Author:  Roscoe A. Bartlett <rabartl@sandia.gov>
Date:    Wed May 20 11:50:46 2020 -0600
Summary: Update doc for supporting only a single node (triilnos/Trilinos#2422)
---
 cmake/tribits/ci_support/checkin-test.py      |   4 +-
 .../package_arch/TribitsAddAdvancedTest.cmake |   3 +
 .../package_arch/TribitsAddTestHelpers.cmake  |  38 ++++
 .../TribitsGenerateResourceSpecFile.cmake     |  94 ++++++++++
 .../package_arch/TribitsGlobalMacros.cmake    |  34 ++++
 .../package_arch/TribitsProjectImpl.cmake     |   9 +-
 .../build_ref/TribitsBuildReferenceBody.rst   | 163 ++++++++++++++++--
 7 files changed, 331 insertions(+), 14 deletions(-)
 create mode 100644 cmake/tribits/core/package_arch/TribitsGenerateResourceSpecFile.cmake

diff --git a/cmake/tribits/ci_support/checkin-test.py b/cmake/tribits/ci_support/checkin-test.py
index c78f32232eaf..4b90663801f8 100755
--- a/cmake/tribits/ci_support/checkin-test.py
+++ b/cmake/tribits/ci_support/checkin-test.py
@@ -741,7 +741,9 @@ def runProjectTestsWithCommandLineArgs(commandLineArgs, configuration = {}):
 
   clp.add_option(
     "--project-name", dest="projectName", action="store",
-    help="Set the project's name. This is used to locate various files.",
+    help="Set the project's name. This is used to locate various files."+\
+      "  If not set, then it reads the project name from the PROJECT_NAME"+\
+      " varaible set in the file SRCDIR/ProjectName.cmake.",
     default=None)
 
   clp.add_option(
diff --git a/cmake/tribits/core/package_arch/TribitsAddAdvancedTest.cmake b/cmake/tribits/core/package_arch/TribitsAddAdvancedTest.cmake
index d7f32749db42..b8cf7ef6727f 100644
--- a/cmake/tribits/core/package_arch/TribitsAddAdvancedTest.cmake
+++ b/cmake/tribits/core/package_arch/TribitsAddAdvancedTest.cmake
@@ -1399,6 +1399,9 @@ FUNCTION(TRIBITS_ADD_ADVANCED_TEST TEST_NAME_IN)
         PROCESSORS "${MAX_NUM_PROCESSORS_USED}")
     ENDIF()
 
+    TRIBITS_PRIVATE_ADD_TEST_ADD_ENVIRONMENT_AND_RESOURCE(${TEST_NAME}
+      ${MAX_NUM_PROCESSORS_USED})
+
     IF (SET_DISABLED_AND_MSG)
       TRIBITS_SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES DISABLED ON)
     ENDIF()
diff --git a/cmake/tribits/core/package_arch/TribitsAddTestHelpers.cmake b/cmake/tribits/core/package_arch/TribitsAddTestHelpers.cmake
index 7c0c174b19ac..6bd8188796ed 100644
--- a/cmake/tribits/core/package_arch/TribitsAddTestHelpers.cmake
+++ b/cmake/tribits/core/package_arch/TribitsAddTestHelpers.cmake
@@ -51,6 +51,21 @@ INCLUDE(AdvancedSet)
 INCLUDE(MessageWrapper)
 INCLUDE(TribitsGetCategoriesString)
 
+
+#
+# Do initialization for test helpers
+#
+# This must be run just before the packages define their tests and this macro
+# must be run in the base-level project scope.
+#
+MACRO(TRIBITS_ADD_TEST_HELPERS_INIT)
+  IF (TPL_ENABLE_CUDA)
+    SET(TRIBITS_TEST_EXTRA_ENVIRONMENT CTEST_KOKKOS_DEVICE_TYPE=gpus)
+    SET(TRIBITS_RESOURCES_PER_PROCESS gpus:1)
+  ENDIF()
+ENDMACRO()
+
+
 #
 # Wrapper function for SET_TESTS_PROPERTIES() to be used in unit testing.
 #
@@ -874,6 +889,8 @@ FUNCTION(TRIBITS_PRIVATE_ADD_TEST_POST_PROCESS_ADDED_TEST  TEST_NAME_IN
     TRIBITS_SET_TESTS_PROPERTIES(${TEST_NAME_IN} PROPERTIES DISABLED ON)
   ENDIF()
 
+  TRIBITS_PRIVATE_ADD_TEST_ADD_ENVIRONMENT_AND_RESOURCE(${TEST_NAME_IN} ${NUM_PROCS_USED_IN})
+
   TRIBITS_PRIVATE_ADD_TEST_ADD_LABEL_AND_KEYWORDS(${TEST_NAME_IN})
 
   TRIBITS_PRIVATE_ADD_TEST_PRINT_ADDED(${TEST_NAME_IN}
@@ -883,3 +900,24 @@ FUNCTION(TRIBITS_PRIVATE_ADD_TEST_POST_PROCESS_ADDED_TEST  TEST_NAME_IN
 ENDFUNCTION()
 
 
+#
+# Add environment and resource properties to a test
+#
+
+FUNCTION(TRIBITS_PRIVATE_ADD_TEST_ADD_ENVIRONMENT_AND_RESOURCE  TEST_NAME_IN
+  NUM_PROCS_USED_IN
+  )
+  IF(TRIBITS_TEST_EXTRA_ENVIRONMENT)
+    TRIBITS_SET_TEST_PROPERTY(${TEST_NAME_IN} APPEND PROPERTY ENVIRONMENT
+      "${TRIBITS_TEST_EXTRA_ENVIRONMENT}")
+  ENDIF()
+
+  IF(TRIBITS_RESOURCES_PER_PROCESS)
+    SET(NUM_PROCESSES ${NUM_PROCS_USED_IN})
+    IF(NOT NUM_PROCESSES OR NUM_PROCESSES LESS 1)
+      SET(NUM_PROCESSES 1)
+    ENDIF()
+    TRIBITS_SET_TESTS_PROPERTIES(${TEST_NAME_IN} PROPERTIES RESOURCE_GROUPS
+      "${NUM_PROCESSES},${TRIBITS_RESOURCES_PER_PROCESS}")
+  ENDIF()
+ENDFUNCTION()
diff --git a/cmake/tribits/core/package_arch/TribitsGenerateResourceSpecFile.cmake b/cmake/tribits/core/package_arch/TribitsGenerateResourceSpecFile.cmake
new file mode 100644
index 000000000000..c50ffab31936
--- /dev/null
+++ b/cmake/tribits/core/package_arch/TribitsGenerateResourceSpecFile.cmake
@@ -0,0 +1,94 @@
+# @HEADER
+# ************************************************************************
+#
+#            TriBITS: Tribal Build, Integrate, and Test System
+#                    Copyright 2013 Sandia Corporation
+#
+# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+# the U.S. Government retains certain rights in this software.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# ************************************************************************
+# @HEADER
+
+
+#
+# Top-level project logic to generate resources spec file
+#
+FUNCTION(TRIBITS_GENERATE_CTEST_RESOURCE_SPEC_FILE_PROJECT_LOGIC)
+  IF (${PROJECT_NAME}_AUTOGENERATE_TEST_RESOURCE_FILE)
+    IF (CTEST_RESOURCE_SPEC_FILE STREQUAL CTEST_RESOURCE_SPEC_FILE_DEFAULT)
+      TRIBITS_GENERATE_CTEST_RESOURCE_SPEC_FILE()
+    ELSE()
+      MESSAGE("NOTE: The test resource file CTEST_RESOURCE_SPEC_FILE='${CTEST_RESOURCE_SPEC_FILE}'"
+        " will not be auto-generated even through"
+        " ${PROJECT_NAME}_AUTOGENERATE_TEST_RESOURCE_FILE=${${PROJECT_NAME}_AUTOGENERATE_TEST_RESOURCE_FILE}"
+        " because its location does not match the default"
+        " location '${CTEST_RESOURCE_SPEC_FILE_DEFAULT}'."
+        "  If you want to auto-generate this file, please clear CTEST_RESOURCE_SPEC_FILE and"
+        " reconfigure or create that file on your own and clear"
+        " ${PROJECT_NAME}_AUTOGENERATE_TEST_RESOURCE_FILE."
+        )
+    ENDIF()
+  ENDIF()
+ENDFUNCTION()
+
+
+#
+# Generate resource spec file
+#
+FUNCTION(TRIBITS_GENERATE_CTEST_RESOURCE_SPEC_FILE)
+  SET(GPUS_JSON)
+  MATH(EXPR LAST_GPU "${${PROJECT_NAME}_CUDA_NUM_GPUS} - 1")
+  SET(FIRST 1)
+  FOREACH(GPU RANGE 0 ${LAST_GPU})
+    IF(NOT FIRST)
+      STRING(APPEND GPUS_JSON ",\n")
+    ENDIF()
+    SET(FIRST 0)
+    STRING(APPEND GPUS_JSON "        {
+          \"id\": \"${GPU}\",
+          \"slots\": ${${PROJECT_NAME}_CUDA_SLOTS_PER_GPU}
+        }")
+  ENDFOREACH()
+  FILE(WRITE "${CMAKE_BINARY_DIR}/ctest_resources.json" "{
+  \"version\": {
+    \"major\": 1,
+    \"minor\": 0
+  },
+  \"local\": [
+    {
+      \"gpus\": [
+${GPUS_JSON}
+      ]
+    }
+  ]
+}
+")
+ENDFUNCTION()
diff --git a/cmake/tribits/core/package_arch/TribitsGlobalMacros.cmake b/cmake/tribits/core/package_arch/TribitsGlobalMacros.cmake
index b18c23b2f1d8..1cf6c81cf946 100644
--- a/cmake/tribits/core/package_arch/TribitsGlobalMacros.cmake
+++ b/cmake/tribits/core/package_arch/TribitsGlobalMacros.cmake
@@ -487,6 +487,38 @@ MACRO(TRIBITS_DEFINE_GLOBAL_OPTIONS_AND_DEFINE_EXTRA_REPOS)
     "Enable explicit template instantiation in all packages that support it"
     )
 
+  ADVANCED_SET(${PROJECT_NAME}_AUTOGENERATE_TEST_RESOURCE_FILE OFF
+    CACHE BOOL
+    "Auto-generate a resource spec file for use with CTest."
+    )
+
+  ADVANCED_SET(${PROJECT_NAME}_CUDA_NUM_GPUS 1
+    CACHE STRING
+    "Number of GPUS to make available in the auto-generated resource spec file."
+    )
+
+  ADVANCED_SET(${PROJECT_NAME}_CUDA_SLOTS_PER_GPU 1
+    CACHE STRING
+    "Number of slots per GPU in the auto-generated resource spec file."
+    )
+
+  SET(CTEST_RESOURCE_SPEC_FILE_DOC_EXTRA "")
+  IF (${PROJECT_NAME}_AUTOGENERATE_TEST_RESOURCE_FILE)
+    SET(CTEST_RESOURCE_SPEC_FILE_DEFAULT  ${CMAKE_BINARY_DIR}/ctest_resources.json)
+    IF ("${CTEST_RESOURCE_SPEC_FILE}" STREQUAL "")
+      SET(CTEST_RESOURCE_SPEC_FILE_DOC_EXTRA
+         "  This file is autogenerated by default since ${PROJECT_NAME}_AUTOGENERATE_TEST_RESOURCE_FILE=${${PROJECT_NAME}_AUTOGENERATE_TEST_RESOURCE_FILE}!" )
+    ENDIF()
+  ELSE()
+    SET(CTEST_RESOURCE_SPEC_FILE_DEFAULT "")
+  ENDIF()
+
+  ADVANCED_SET(CTEST_RESOURCE_SPEC_FILE
+    "${CTEST_RESOURCE_SPEC_FILE_DEFAULT}"
+    CACHE FILEPATH
+    "Resource spec file for CTest.${CTEST_RESOURCE_SPEC_FILE_DOC_EXTRA}"
+    )
+
   IF (USE_XSDK_DEFAULTS)
     # Need to set BUILD_SHARED_LIBS default here based on USE_XSDK_DEFAULTS
     # and not in TRIBITS_SETUP_ENV() in case there is logic in TriBITS or
@@ -2264,6 +2296,8 @@ MACRO(TRIBITS_INCLUDE_CTEST_SUPPORT)
   TRIBITS_CONFIGURE_CTEST_CUSTOM(${${PROJECT_NAME}_SOURCE_DIR}
     ${${PROJECT_NAME}_BINARY_DIR})
 
+  TRIBITS_ADD_TEST_HELPERS_INIT()
+
 ENDMACRO()
 # NOTE: The above logic with DART_TESTING_TIMEOUT is a huge hack.  For some
 # reason, on the first configure CMake will not put the local value of the
diff --git a/cmake/tribits/core/package_arch/TribitsProjectImpl.cmake b/cmake/tribits/core/package_arch/TribitsProjectImpl.cmake
index c4b07bc23d8a..eae6208fbbcb 100644
--- a/cmake/tribits/core/package_arch/TribitsProjectImpl.cmake
+++ b/cmake/tribits/core/package_arch/TribitsProjectImpl.cmake
@@ -66,6 +66,7 @@ INCLUDE(TribitsIncludeDirectories)
 INCLUDE(TribitsFindPythonInterp)
 INCLUDE(TribitsGlobalMacros)
 INCLUDE(TribitsConfigureCTestCustom)
+INCLUDE(TribitsGenerateResourceSpecFile)
 
 INCLUDE(AdvancedSet)
 INCLUDE(AdvancedOption)
@@ -321,7 +322,13 @@ MACRO(TRIBITS_PROJECT_IMPL)
   ENDIF()
 
   #
-  # N) Show final timing and end
+  # N) Generate resource spec file if applicable
+  #
+
+  TRIBITS_GENERATE_CTEST_RESOURCE_SPEC_FILE_PROJECT_LOGIC()
+
+  #
+  # O) Show final timing and end
   #
 
   MESSAGE("")
diff --git a/cmake/tribits/doc/build_ref/TribitsBuildReferenceBody.rst b/cmake/tribits/doc/build_ref/TribitsBuildReferenceBody.rst
index 8ca2985db9af..5d66d6ac484e 100644
--- a/cmake/tribits/doc/build_ref/TribitsBuildReferenceBody.rst
+++ b/cmake/tribits/doc/build_ref/TribitsBuildReferenceBody.rst
@@ -10,6 +10,16 @@
 
 .. _Ninja: https://ninja-build.org
 
+.. _CMake Ninja Fortran Support: https://cmake.org/cmake/help/latest/generator/Ninja.html
+
+.. _CTest Resource Allocation System: https://cmake.org/cmake/help/latest/manual/ctest.1.html#resource-allocation
+
+.. _CTest Resource Specification File: https://cmake.org/cmake/help/latest/manual/ctest.1.html#ctest-resource-specification-file
+
+.. _CTest Resource Allocation Environment Variables: https://cmake.org/cmake/help/latest/manual/ctest.1.html#environment-variables
+
+.. _RESOURCE_GROUPS: https://cmake.org/cmake/help/latest/prop_test/RESOURCE_GROUPS.html#prop_test:RESOURCE_GROUPS
+
 
 
 Getting set up to use CMake
@@ -50,7 +60,7 @@ To get help for installing CMake with this script use::
 
   $ $TRIBITS_BASE_DIR/devtools_install/install-cmake.py --help
 
-NOTE: you will want to read the help message about how to install CMake to
+NOTE: You will want to read the help message about how to install CMake to
 share with other users and maintainers and how to install with sudo if needed.
 
 
@@ -69,10 +79,17 @@ The Kitware fork of Ninja at:
 
 provides releases of Ninja that allows CMake 3.7.0+ to build Fortran code with
 Ninja.  For example, the Kitware Ninja release ``1.7.2.git.kitware.dyndep-1``
-works with Fortran.
+works with Fortran.  As of Ninja 1.10+, Fortran support is part of the
+official Google-maintained version of Ninja as can be obtained from:
 
-Ninja is easy to install from source.  It is a simple ``configure
---prefix=<dir>``, ``make`` and ``make install``.
+  https://github.com/ninja-build/ninja/releases
+
+and as of CMake 3.17+, cmake will recognize native Fortran support for Ninja
+1.10+ (see `CMake Ninja Fortran Support`_).
+
+Ninja is easy to install from source on almost any machine.  On Unix/Linux
+systems it is as simple as ``configure --prefix=<dir>``, ``make`` and ``make
+install``.
 
 
 Getting CMake Help
@@ -1881,8 +1898,16 @@ NOTES:
   Python is enabled.
 
 
+Test-related configuration settings
+-----------------------------------
+
+Many options can be set at configure time to determine what tests are enabled
+and how they are run.  The following subsections describe these various
+settings.
+
+
 Enabling different test categories
-----------------------------------
+++++++++++++++++++++++++++++++++++
 
 To turn on a set a given set of tests by test category, set::
 
@@ -1901,7 +1926,7 @@ and don't nest with the other categories.
 
 
 Disabling specific tests
-------------------------
+++++++++++++++++++++++++
 
 Any TriBITS-added ctest test (i.e. listed in ``ctest -N``) can be disabled at
 configure time by setting::
@@ -1940,7 +1965,7 @@ Also note that other specific defined tests can also be excluded using the
 
 
 Disabling specific test executable builds
------------------------------------------
++++++++++++++++++++++++++++++++++++++++++
 
 Any TriBITS-added executable (i.e. listed in ``make help``) can be disabled
 from being built by setting::
@@ -1956,7 +1981,7 @@ at configure time to CMake STDOUT.
 
 
 Disabling just the ctest tests but not the test executables
------------------------------------------------------------
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
 To allow the building of the tests and examples in a package (enabled either
 through setting `<Project>_ENABLE_TESTS`_ ``= ON`` or
@@ -2016,7 +2041,7 @@ should be build and which tests should be run can be made at configure time.
 
 
 Trace test addition or exclusion
---------------------------------
+++++++++++++++++++++++++++++++++
 
 To see what tests get added and see those that don't get added for various
 reasons, configure with::
@@ -2032,7 +2057,7 @@ arguments).
 
 
 Enable advanced test start and end times and timing blocks
-----------------------------------------------------------
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 
 For tests added using ``TRIBITS_ADD_ADVANCED_TEST()``, one can see start and
 end times for the tests and the timing for each ``TEST_<IDX>`` block in the
@@ -2047,8 +2072,9 @@ systems and not native Windows systems.
 
 .. _DART_TESTING_TIMEOUT:
 
+
 Setting test timeouts at configure time
----------------------------------------
++++++++++++++++++++++++++++++++++++++++
 
 A maximum default time limit (timeout) for all the tests can be set at
 configure time using the cache variable::
@@ -2094,8 +2120,9 @@ NOTES:
 
 .. _<Project>_SCALE_TEST_TIMEOUT:
 
+
 Scaling test timeouts at configure time
----------------------------------------
++++++++++++++++++++++++++++++++++++++++
 
 The global default test timeout `DART_TESTING_TIMEOUT`_ as well as all of the
 timeouts for the individual tests that have their own timeout set (through the
@@ -2135,6 +2162,118 @@ NOTES:
   increase ``DART_TESTING_TIMEOUT`` or ``TimeOut`` with each new configure.)
 
 
+Spreading out and limiting tests running on GPUs
+++++++++++++++++++++++++++++++++++++++++++++++++
+
+For CUDA builds (i.e. ``TPL_ENABLE_CUDA=ON``) with tests that run on a single
+node which has multiple GPUs, there are settings that can help ``ctest``
+spread out the testing load over all of the GPUs and limit the number of
+kernels that can run at the same time on a single GPU.
+
+To instruct ``ctest`` to spread out the load on multiple GPUs, one can set the
+following configure-time options::
+
+  -D TPL_ENABLE_CUDA=ON \
+  -D <Project>_AUTOGENERATE_TEST_RESOURCE_FILE=ON \
+  -D <Project>_CUDA_NUM_GPUS=<num-gpus> \
+  -D <Project>_CUDA_SLOTS_PER_GPU=<slots-per-gpu> \
+
+This will cause a file ``ctest_resources.json`` to get generated in the base
+build directory that CTest will use to spread out the work across the
+``<num-gpus>`` GPUs with a maximum of ``<slots-per-gpu>`` processes running
+kernels on any one GPU.  (This uses the `CTest Resource Allocation System`_
+first added in CMake 3.16 and made more usable in CMake 3.18.)
+
+For example, when running on one node on a system with 4 GPUs per node
+(allowing 5 kernels to run at a time on a single GPU) one would configure
+with::
+
+  -D TPL_ENABLE_CUDA=ON \
+  -D <Project>_AUTOGENERATE_TEST_RESOURCE_FILE=ON \
+  -D <Project>_CUDA_NUM_GPUS=4 \
+  -D <Project>_CUDA_SLOTS_PER_GPU=5 \
+
+This allows, for example, up to 5 tests using 4-rank MPI jobs, or 10 tests
+using 2-rank MPI jobs, or 20 tests using 1-rank MPI jobs, to run at the same
+time (or any combination of tests that add up to 20 or less total MPI
+processes to run at the same time).  But a single 21-rank or above MPI test job
+would not be allowed to run and would be listed as "Not Run" because it would
+have required more than ``<slots-per-gpu> = 5`` MPI processes running kernels
+at one time on a single GPU.  (Therefore, one must set ``<slots-per-gpu>``
+large enough to allow all of the defined tests to run or one should avoid
+defining tests that require too many slots for available GPUs.)
+
+The CTest implementation uses a breadth-first approach to spread out the work
+across all the available GPUs before adding more work for each GPU.  For
+example, when running two 2-rank MPI tests at the same time (e.g. using
+``ctest -j4``) in the above example, CTest will instruct these tests at
+runtime to spread out across all 4 GPUs and therefore run the CUDA kernels for
+just one MPI process on each GPU.  But when running four 2-rank MPI tests at
+the same time (e.g. using ``ctest -j8``), then each of the 4 GPUs would get
+the work of two MPI processes (i.e. running two kernels at a time on each of
+the 4 GPUs).
+
+One can also manually create a `CTest Resource Specification File`_ and point
+to it by setting::
+
+  -D TPL_ENABLE_CUDA=ON \
+  -D CTEST_RESOURCE_SPEC_FILE=<file-path> \
+
+In all cases, ctest will not spread out and limit running on the GPUs unless
+``TPL_ENABLE_CUDA=ON`` is set which causes TriBITS to add the
+`RESOURCE_GROUPS`_ test property to each test.
+
+NOTES:
+
+* This setup assumes that a single MPI process will run just one kernel on its
+  assigned GPU and therefore take up one GPU "slot".  So a 2-rank MPI test
+  will take up 2 total GPU "slots" (either on the same or two different GPUs,
+  as determined by CTest).
+
+* The underlying test executables/scripts themselves must be set up to read in
+  the `CTest Resource Allocation Environment Variables`_ set specifically by
+  ``ctest`` on the fly for each test and then must run on the specific GPUs
+  specified in those environment variables.  (If the project is using a Kokkos
+  back-end implementation for running CUDA code on the GPU then this will work
+  automatically since Kokkos is set up to automatically look for these
+  CTest-set environment variables.  Without this CTest and TriBITS
+  implementation, when running 2-rank MPI tests on a node with 4 GPUs, Kokkos
+  would just utilize the first two GPUs and leave the other two GPUs idle.
+  And when running 1-rank MPI tests, Kokkos would only utilize the first GPU
+  and leave the last three GPUs idle.)
+
+* The option ``<Project>_AUTOGENERATE_TEST_RESOURCE_FILE=ON`` sets the
+  built-in CMake variable ``CTEST_RESOURCE_SPEC_FILE`` to point to the
+  generated file ``ctest_resources.json`` in the build directory.
+
+* One can avoid setting the CMake cache variables
+  ``<Project>_AUTOGENERATE_TEST_RESOURCE_FILE`` or
+  ``CTEST_RESOURCE_SPEC_FILE`` at configure time and can instead pass
+  the path to the `CTest Resource Specification File`_ directly into ``ctest``
+  using the command-line option ``--resource-spec-file`` or the
+  ``ctest_test()`` function argument ``RESOURCE_SPEC_FILE`` (when using a
+  ``ctest -S`` script driver).  (This allows using CMake 3.16+ since support
+  for the ``CTEST_RESOURCE_SPEC_FILE`` cache variable was not added until
+  CMake 3.18.)
+
+* A patched version of CMake 3.17 can be used to get built-in CMake/CTest
+  support for the ``CTEST_RESOURCE_SPEC_FILE`` cache variable, as installed
+  using the TriBITS-provided ``install-cmake.py`` command (using option
+  ``--cmake-version=3.17``, see `Installing CMake from source [developers and
+  experienced users]`_).  This avoids needing to explicitly pass the ctest
+  resource file to ``ctest`` at runtime for CMake/CTest versions [3.16, 3.18).
+
+* **WARNING:** This currently only works for a single node, not multiple
+  nodes.  (CTest needs to be extended to work correctly for multiple nodes
+  where each node has multiple GPUs.  Alternatively, TriBITS could be extended
+  to make this work for multiple nodes but will require considerable work and
+  will need to closely interact with the MPI launcher to control what nodes
+  are run on for each MPI job/test.)
+
+* **WARNING:** This feature is still evolving in CMake/CTest and TriBITS and
+  therefore the input options and behavior of this may change in the future.
+
+
 Enabling support for coverage testing
 -------------------------------------
 

From 1342d5976e4bc83135afa2d4e8a5ecb011bcca25 Mon Sep 17 00:00:00 2001
From: "Roscoe A. Bartlett" <rabartl@sandia.gov>
Date: Mon, 18 May 2020 16:00:09 -0600
Subject: [PATCH 2/8] Allow pointing to a tribits outside of Trilinos
 (trilinos/Trilinos#2422)

This facilitates testing with updates in the external TriBITS git repo.
---
 cmake/std/atdm/checkin-test-atdm.sh              | 9 +++++++--
 cmake/std/atdm/utils/checkin-test-atdm-single.sh | 3 ++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/cmake/std/atdm/checkin-test-atdm.sh b/cmake/std/atdm/checkin-test-atdm.sh
index 5e50ede39c2d..da7b107469b2 100755
--- a/cmake/std/atdm/checkin-test-atdm.sh
+++ b/cmake/std/atdm/checkin-test-atdm.sh
@@ -19,7 +19,6 @@ if [ "$ATDM_TRILINOS_DIR" == "" ] ; then
     export ATDM_TRILINOS_DIR=$STD_ATDM_DIR/../../..
   fi
 fi
-
 echo "ATDM_TRILINOS_DIR = '$ATDM_TRILINOS_DIR'"
 
 if [ "$ATDM_TRILINOS_DIR" == "" ] ; then
@@ -27,6 +26,11 @@ if [ "$ATDM_TRILINOS_DIR" == "" ] ; then
   exit 1
 fi
 
+if [ "$ATDM_TRIBITS_DIR" == "" ] ; then
+  export ATDM_TRIBITS_DIR=$ATDM_TRILINOS_DIR/cmake/tribits
+fi
+echo "ATDM_TRIBITS_DIR = '${ATDM_TRIBITS_DIR}'"
+
 #
 # Load a default env for the system
 #
@@ -241,7 +245,8 @@ echo
 echo "  ==> See output file checkin-test.final.out" 
 echo
 
-$ATDM_TRILINOS_DIR/cmake/tribits/ci_support/checkin-test.py \
+${ATDM_TRIBITS_DIR}/ci_support/checkin-test.py \
+  --src-dir=$ATDM_TRILINOS_DIR \
   --default-builds= --st-extra-builds=$ATDM_BUILD_NAME_KEYS_COMMA_LIST \
   --allow-no-pull "$ATDM_CHT_ENABLE_PACKAGES_ARG" \
   $ATDM_CHT_SEND_EMAIL_TO_ARG \
diff --git a/cmake/std/atdm/utils/checkin-test-atdm-single.sh b/cmake/std/atdm/utils/checkin-test-atdm-single.sh
index 3323c0a683c4..7d256efc93ee 100755
--- a/cmake/std/atdm/utils/checkin-test-atdm-single.sh
+++ b/cmake/std/atdm/utils/checkin-test-atdm-single.sh
@@ -33,7 +33,8 @@ fi
 
 set -x
 
-$ATDM_TRILINOS_DIR/cmake/tribits/ci_support/checkin-test.py \
+${ATDM_TRIBITS_DIR}/ci_support/checkin-test.py \
+  --src-dir=$ATDM_TRILINOS_DIR \
   --make-options="${make_options}" \
   --ctest-options="-j $ATDM_CONFIG_CTEST_PARALLEL_LEVEL" \
   --st-extra-builds=$ATDM_BUILD_NAME_KEYS "$@" \

From bb4e04231e5b1418660cd3158707ef0cef8d1c2c Mon Sep 17 00:00:00 2001
From: "Roscoe A. Bartlett" <rabartl@sandia.gov>
Date: Wed, 13 May 2020 10:37:26 -0600
Subject: [PATCH 3/8] ATDM: waterman: Use cmake 3.17.2 and ctest resource
 limits for GPUs (trilinos/Trilinos#2422)

This is using a special TriBITS-patched version of CMake 3.17.2.

This should spread things out a little better over the GPUs.
---
 cmake/std/atdm/waterman/environment.sh      | 8 ++++++--
 cmake/std/atdm/waterman/tweaks/Tweaks.cmake | 8 ++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/cmake/std/atdm/waterman/environment.sh b/cmake/std/atdm/waterman/environment.sh
index 61f33977e117..7c971f787b34 100755
--- a/cmake/std/atdm/waterman/environment.sh
+++ b/cmake/std/atdm/waterman/environment.sh
@@ -151,10 +151,14 @@ elif [[ "$ATDM_CONFIG_COMPILER" == "CUDA"* ]] ; then
 
 fi
 
-# CMake and ninja
-module swap cmake/3.6.2 cmake/3.12.3
+# Ninja
 module load ninja/1.7.2
 
+# CMake
+#module swap cmake/3.6.2 cmake/3.12.3
+module unload cmake/3.6.2
+export PATH=/home/atdm-devops-admin/tools/waterman/cmake-3.17.2/bin:$PATH
+
 # HWLOC
 
 export ATDM_CONFIG_USE_HWLOC=OFF
diff --git a/cmake/std/atdm/waterman/tweaks/Tweaks.cmake b/cmake/std/atdm/waterman/tweaks/Tweaks.cmake
index 5bb962b15eef..1e95a3639b84 100644
--- a/cmake/std/atdm/waterman/tweaks/Tweaks.cmake
+++ b/cmake/std/atdm/waterman/tweaks/Tweaks.cmake
@@ -1,3 +1,11 @@
+#
+# Set up to limit running on GPUs
+#
+
+ATDM_SET_CACHE(Trilinos_AUTOGENERATE_TEST_RESOURCE_FILE ON CACHE BOOL)
+ATDM_SET_CACHE(Trilinos_CUDA_NUM_GPUS 2 CACHE STRING)
+ATDM_SET_CACHE(Trilinos_CUDA_SLOTS_PER_GPU 2 CACHE STRING)
+
 #
 # Disables across multiple builds on 'waterman'
 #

From 8ed34c5e86405bb81d41d77c28171db89422f495 Mon Sep 17 00:00:00 2001
From: "Roscoe A. Bartlett" <rabartl@sandia.gov>
Date: Wed, 13 May 2020 20:32:40 -0600
Subject: [PATCH 4/8] ATDM: waterman: Reduce from ctest -j4 to -j2
 (trilinos/Trilinos#2422)

This will reduce the number of timeouts and seems to run almost as fast due to
problems with contention for the GPUs.
---
 cmake/std/atdm/waterman/environment.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/std/atdm/waterman/environment.sh b/cmake/std/atdm/waterman/environment.sh
index 7c971f787b34..b32ef020c75b 100755
--- a/cmake/std/atdm/waterman/environment.sh
+++ b/cmake/std/atdm/waterman/environment.sh
@@ -146,8 +146,8 @@ elif [[ "$ATDM_CONFIG_COMPILER" == "CUDA"* ]] ; then
   export CUDA_LAUNCH_BLOCKING=1
   export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
   export KOKKOS_NUM_DEVICES=2
-  export ATDM_CONFIG_CTEST_PARALLEL_LEVEL=4
-  # Avoids timeouts due to not running on separate GPUs (see #2446)
+  export ATDM_CONFIG_CTEST_PARALLEL_LEVEL=2
+  # Avoids timeouts due to not running on separate GPUs (e.g. see #2446)
 
 fi
 

From 3b308f7bef5360d0f412e481f13f44c6f824b3b0 Mon Sep 17 00:00:00 2001
From: "Roscoe A. Bartlett" <rabartl@sandia.gov>
Date: Mon, 18 May 2020 17:15:03 -0600
Subject: [PATCH 5/8] ATDM: Update documentation for updated 'waterman' env
 (trilinos/Trilinos#2422)

---
 cmake/std/atdm/README.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/cmake/std/atdm/README.md b/cmake/std/atdm/README.md
index 1972571dc292..6d4064d74f66 100644
--- a/cmake/std/atdm/README.md
+++ b/cmake/std/atdm/README.md
@@ -1034,7 +1034,7 @@ command to run if using a CUDA build.  For example, to configure, build and
 run the tests for the default `cuda-debug` build for say `MueLu` (after
 cloning Trilinos on the `develop` branch) one would do:
 
-```
+```bash
 $ cd <some_build_dir>/
 
 $ source $TRILINOS_DIR/cmake/std/atdm/load-env.sh cuda-debug
@@ -1047,13 +1047,20 @@ $ cmake \
 
 $ make NP=20
 
-$ bsub -x -Is -n 20 ctest -j4
+$ bsub -x -Is -n 20 ctest -j2
 ```
 
 **NOTE:** While the above example shows loading the environment, configuring
 and building on the login node, one can also do these on the compute nodes as
 well.  In fact, that is what the CTest -S drivers do in automated testing on
-'waterman'.
+'waterman'.  To get an interactive compute node, do:
+
+```
+$ bsub -x -Is -n 20 bash
+```
+
+Then one can configure, build, and run tests interactively on that compute
+node.
 
 Note that one can also run the same build and tests using the <a
 href="#checkin-test-atdmsh">checkin-test-atdm.sh</a> script as:

From 79acd463d3a64aac9d1e700dd5cd075faa9bf78c Mon Sep 17 00:00:00 2001
From: "Roscoe A. Bartlett" <rabartl@sandia.gov>
Date: Fri, 15 May 2020 15:32:42 -0400
Subject: [PATCH 6/8] Kokkos: Extract and use get_gpu() (kokkos/kokkos#3040,
 trilinos/Trilinos#6840)

This fixes the tests KokkosCore_UnitTest_DefaultInit_<x>_MPI_1 that fail when
running with the ctest GPU allocation feature because they don't run on
GPU device 0 (see kokkos/kokkos#3040).

Note: Since get_ctest_gpu() returns 0 if CTest has not provided anything, it
is safe to always call it.

This new function get_gpu() should really be unit tested on its own.
---
 packages/kokkos/core/src/impl/Kokkos_Core.cpp | 67 ++++++++++---------
 .../unit_test/TestDefaultDeviceTypeInit.hpp   | 10 ++-
 2 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
index 9640e0fccb75..8530e2602ab6 100644
--- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -166,36 +166,7 @@ int get_ctest_gpu(const char* local_rank_str) {
   return std::atoi(id.c_str());
 }
 
-namespace {
-
-bool is_unsigned_int(const char* str) {
-  const size_t len = strlen(str);
-  for (size_t i = 0; i < len; ++i) {
-    if (!isdigit(str[i])) {
-      return false;
-    }
-  }
-  return true;
-}
-
-void initialize_backends(const InitArguments& args) {
-// This is an experimental setting
-// For KNL in Flat mode this variable should be set, so that
-// memkind allocates high bandwidth memory correctly.
-#ifdef KOKKOS_ENABLE_HBWSPACE
-  setenv("MEMKIND_HBW_NODES", "1", 0);
-#endif
-
-  // Protect declarations, to prevent "unused variable" warnings.
-#if defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_THREADS) || \
-    defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HPX)
-  const int num_threads = args.num_threads;
-#endif
-#if defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_OPENMPTARGET)
-  const int use_numa = args.num_numa;
-#endif
-#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_ROCM) || \
-    defined(KOKKOS_ENABLE_HIP)
+int get_gpu(const InitArguments& args) {
   int use_gpu           = args.device_id;
   const int ndevices    = args.ndevices;
   const int skip_device = args.skip_device;
@@ -208,7 +179,7 @@ void initialize_backends(const InitArguments& args) {
       local_rank_str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK");  // MVAPICH2
     if (!local_rank_str)
       local_rank_str = std::getenv("SLURM_LOCALID");  // SLURM
-
+  
     auto const* ctest_kokkos_device_type =
         std::getenv("CTEST_KOKKOS_DEVICE_TYPE");  // CTest
     auto const* ctest_resource_group_count_str =
@@ -231,6 +202,40 @@ void initialize_backends(const InitArguments& args) {
     // shift assignments over by one so no one is assigned to "skip_device"
     if (use_gpu >= skip_device) ++use_gpu;
   }
+  return use_gpu;
+}
+
+namespace {
+
+bool is_unsigned_int(const char* str) {
+  const size_t len = strlen(str);
+  for (size_t i = 0; i < len; ++i) {
+    if (!isdigit(str[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void initialize_backends(const InitArguments& args) {
+// This is an experimental setting
+// For KNL in Flat mode this variable should be set, so that
+// memkind allocates high bandwidth memory correctly.
+#ifdef KOKKOS_ENABLE_HBWSPACE
+  setenv("MEMKIND_HBW_NODES", "1", 0);
+#endif
+
+  // Protect declarations, to prevent "unused variable" warnings.
+#if defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_THREADS) || \
+    defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HPX)
+  const int num_threads = args.num_threads;
+#endif
+#if defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+  const int use_numa = args.num_numa;
+#endif
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_ROCM) || \
+    defined(KOKKOS_ENABLE_HIP)
+  int use_gpu = get_gpu(args);
 #endif  // defined( KOKKOS_ENABLE_CUDA )
 
 #if defined(KOKKOS_ENABLE_OPENMP)
diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 33c736c5e01a..16a7665062f2 100644
--- a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -52,6 +52,14 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
+namespace Kokkos {
+namespace Impl {
+
+int get_gpu(const InitArguments& args);
+
+}  // namespace Impl
+}  // namespace Kokkos
+
 namespace Test {
 
 namespace Impl {
@@ -273,7 +281,7 @@ void check_correct_initialization(const Kokkos::InitArguments& argstruct) {
 
     int expected_device = argstruct.device_id;
     if (argstruct.device_id < 0) {
-      expected_device = 0;
+      expected_device = Kokkos::Impl::get_gpu(argstruct);
     }
 
     ASSERT_EQ(expected_device, device);

From 25d209a0e9450f2f2807760c191fc1a5cc60c920 Mon Sep 17 00:00:00 2001
From: "Roscoe A. Bartlett" <rabartl@sandia.gov>
Date: Tue, 19 May 2020 15:02:23 -0600
Subject: [PATCH 7/8] Kokkos: Switch to use Kokkos::Cuda().cuda_device() for
 expected_device (kokkos/kokkos#3040, trilinos/Trilinos#6840)

This seems to also fix the KokkosCore_UnitTest_DefaultInit_<x>_MPI_1 tests
that were failing when running with the ctest GPU allocation feature
because they don't run on GPU device 0 (see kokkos/kokkos#3040).
---
 .../core/unit_test/TestDefaultDeviceTypeInit.hpp       | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 16a7665062f2..c27f13e956ce 100644
--- a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -52,14 +52,6 @@
 
 #if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
 
-namespace Kokkos {
-namespace Impl {
-
-int get_gpu(const InitArguments& args);
-
-}  // namespace Impl
-}  // namespace Kokkos
-
 namespace Test {
 
 namespace Impl {
@@ -281,7 +273,7 @@ void check_correct_initialization(const Kokkos::InitArguments& argstruct) {
 
     int expected_device = argstruct.device_id;
     if (argstruct.device_id < 0) {
-      expected_device = Kokkos::Impl::get_gpu(argstruct);
+      expected_device = Kokkos::Cuda().cuda_device();
     }
 
     ASSERT_EQ(expected_device, device);

From 88cd5928c2e94aebae1071485a9a08927f3ce593 Mon Sep 17 00:00:00 2001
From: "Roscoe A. Bartlett" <rabartl@sandia.gov>
Date: Tue, 19 May 2020 16:36:55 -0600
Subject: [PATCH 8/8] ATDM: ride: Spread out work over GPUs
 (trilinos/Trilinos#2422)

This also switches to a patched CMake 3.17.2, which is needed to support
this feature.
---
 cmake/std/atdm/ride/environment.sh      | 2 +-
 cmake/std/atdm/ride/tweaks/Tweaks.cmake | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/cmake/std/atdm/ride/environment.sh b/cmake/std/atdm/ride/environment.sh
index 7c3b9bde5290..ddd5893b8e0f 100755
--- a/cmake/std/atdm/ride/environment.sh
+++ b/cmake/std/atdm/ride/environment.sh
@@ -225,7 +225,7 @@ export ATDM_CONFIG_NETCDF_LIBS="-L${NETCDF_ROOT}/lib;-L${HDF5_ROOT}/lib;${NETCDF
 
 # Use manually installed cmake and ninja to try to avoid module loading
 # problems (see TRIL-208)
-export PATH=/ascldap/users/rabartl/install/white-ride/cmake-3.11.2/bin:/ascldap/users/rabartl/install/white-ride/ninja-1.8.2/bin:$PATH
+export PATH=/home/atdm-devops-admin/tools/ride/cmake-3.17.2/bin:/home/rabartl/install/white-ride/ninja-1.8.2/bin:$PATH
 
 # Set MPI wrappers
 export MPICC=`which mpicc`
diff --git a/cmake/std/atdm/ride/tweaks/Tweaks.cmake b/cmake/std/atdm/ride/tweaks/Tweaks.cmake
index b3f46e09210b..21cf549f5d74 100644
--- a/cmake/std/atdm/ride/tweaks/Tweaks.cmake
+++ b/cmake/std/atdm/ride/tweaks/Tweaks.cmake
@@ -1,3 +1,11 @@
+#
+# Set up to limit running on GPUs
+#
+
+ATDM_SET_CACHE(Trilinos_AUTOGENERATE_TEST_RESOURCE_FILE ON CACHE BOOL)
+ATDM_SET_CACHE(Trilinos_CUDA_NUM_GPUS 2 CACHE STRING)
+ATDM_SET_CACHE(Trilinos_CUDA_SLOTS_PER_GPU 2 CACHE STRING)
+
 #
 # Disables across multiple builds on 'ride'
 #