Skip to content

Commit

Permalink
switch ATDM mutrino build to use sbatch
Browse files Browse the repository at this point in the history
  • Loading branch information
fryeguy52 committed Jun 13, 2018
1 parent 767c0db commit ff791f8
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 4 deletions.
11 changes: 8 additions & 3 deletions cmake/ctest/drivers/atdm/mutrino/local-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ source $WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ctest-s-driver-config-build.

set -x

/usr/local/bin/salloc -N 1 -p standard -J $JOB_NAME \
--time=${SALLOC_CTEST_TIME_LIMIT_MINUTES} \
$WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ctest-s-driver-test.sh
atdm_run_script_on_compute_node \
$WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ctest-s-driver-test.sh \
$PWD/ctest-s-driver-test.out \
${SALLOC_CTEST_TIME_LIMIT_MINUTES}

#/usr/local/bin/salloc -N 1 -p standard -J $JOB_NAME \
# --time=${SALLOC_CTEST_TIME_LIMIT_MINUTES} \
# $WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ctest-s-driver-test.sh
103 changes: 102 additions & 1 deletion cmake/std/atdm/mutrino/environment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ module use /projects/EMPIRE/mutrino/tpls/hsw/modulefiles
export ATDM_CONFIG_MPI_EXEC="/opt/slurm/bin/srun"

# srun does not accept "-np" for # of processors
export ATDM_CONFIG_MPI_EXEC_NUMPROCS_FLAG="-n"
export ATDM_CONFIG_MPI_EXEC_NUMPROCS_FLAG="--ntasks"
export ATDM_CONFIG_MPI_PRE_FLAGS="--mpi=pmi2;--ntasks-per-node;36"
export ATDM_CONFIG_KOKKOS_ARCH=HSW
export ATDM_CONFIG_CTEST_PARALLEL_LEVEL=16

Expand Down Expand Up @@ -55,3 +56,103 @@ export ATDM_CONFIG_NETCDF_LIBS="-L${BOOST_ROOT}/lib;-L${NETCDF_ROOT}/lib;-L${PNE

export ATDM_CONFIG_MPI_POST_FLAG="-c 4"
export ATDM_CONFIG_COMPLETED_ENV_SETUP=TRUE

#
# Run a script on a compute node, sending its STDOUT and STDERR output to a
# file while also echoing that output to the console as it is produced.  The
# primary purpose is to run the tests on the compute node.
#
# Usage:
#
#   atdm_run_script_on_compute_node <script_to_run> <output_file> \
#     [<timeout>] [<account>]
#
# Arguments:
#
#   <script_to_run>  Script to submit with 'sbatch'.  May carry arguments in
#                    the same string, e.g. "<some-script> <arg1> <arg2>".
#   <output_file>    File that receives the job output.  An existing file is
#                    removed and an empty one is created before submission.
#   <timeout>        Value passed to 'sbatch --time'.  Default: 1:30:00.
#   <account>        Account name.  Default: fy150090.  NOTE: currently only
#                    reported in the banner; the 'sbatch --account' variant
#                    of the command is disabled (see below).
#
# If <timeout> and/or <account> are not given, then defaults are provided that
# work for the Jenkins driver process.
#
# Environment:
#
#   JOB_NAME  If set and non-empty, used as the sbatch job name (-J).  If it
#             is not set, no -J option is passed and sbatch picks the name.
#
# Returns:
#
#   The exit status of 'sbatch --wait' (which the Slurm documentation
#   describes as the exit code of the submitted job), or 1 if a required
#   argument is missing.
#
# Side effects: shell tracing (set -x) is left turned off on return.
#
# Note that the script has to be a bash script that 'sbatch' can copy and run
# from a temp location and it still has to work.  So the script has to use
# absolute directory paths, not relative paths, and must not assume symlinks,
# etc.
#
function atdm_run_script_on_compute_node {

  set +x

  # Keep all working variables local so that sourcing this file and calling
  # the function does not pollute the caller's shell.
  local script_to_run=${1:-}
  local output_file=${2:-}
  local timeout_input=${3:-}
  local account_input=${4:-}
  local timeout account sbatch_pid tail_pid
  local sbatch_rtn=0

  echo
  echo "***"
  echo "*** atdm_run_script_on_compute_node '${script_to_run}' '${output_file}' '${timeout_input}' '${account_input}'"
  echo "***"
  echo

  if [ -z "${script_to_run}" ] || [ -z "${output_file}" ] ; then
    echo "Error: <script_to_run> and <output_file> are required!" >&2
    return 1
  fi

  timeout=${timeout_input:-1:30:00}
  # shellcheck disable=SC2034 -- only used by the disabled --account variant
  account=${account_input:-fy150090}

  if [ -e "${output_file}" ] ; then
    echo "Remove existing file $output_file"
    rm "${output_file}"
  fi
  echo "Create empty file $output_file"
  touch "${output_file}"

  echo
  echo "Running '$script_to_run' using sbatch in the background ..."
  # Disabled variant that also passes the account:
  #   sbatch --output=$output_file --wait -N1 --time=${timeout} \
  #     -J $JOB_NAME --account=${account} ${script_to_run} &
  #
  # ${script_to_run} is deliberately left unquoted so that a string such as
  # "<script> <arg1> <arg2>" is split into the script and its arguments.
  # The -J option is only emitted when JOB_NAME is non-empty; otherwise an
  # empty expansion would make -J consume the script path as the job name.
  set -x
  # shellcheck disable=SC2086
  sbatch --output="${output_file}" --wait -N1 --time="${timeout}" \
    ${JOB_NAME:+-J "${JOB_NAME}"} ${script_to_run} &
  sbatch_pid=$!
  set +x

  echo
  echo "Tailing output file $output_file in the background ..."
  set -x
  tail -f "${output_file}" &
  tail_pid=$!
  set +x

  echo
  echo "Waiting for SBATCH_PID=$sbatch_pid ..."
  # Capture the status with '||' so that a failing job does not abort the
  # shell under 'set -e' before the background 'tail' has been cleaned up.
  wait "${sbatch_pid}" || sbatch_rtn=$?

  # NOTE(review): 'tail' is stopped as soon as sbatch returns, so the very
  # last lines of the job may not reach the console; presumably the output
  # file itself is still complete -- confirm.
  echo
  echo "Kill TAIL_BID=$tail_pid"
  kill "${tail_pid}" 2>/dev/null
  wait "${tail_pid}" 2>/dev/null

  echo
  echo "Finished running ${script_to_run}!"
  echo

  # Propagate the job's result so that callers can detect failures.
  return "${sbatch_rtn}"

}

# NOTE: The above function is implemented in this way using 'sbatch' so that
# we can avoid using 'salloc', which is believed to cause ORTE errors. But we
# still want to see live output from the script so that we can report it on
# Jenkins. Therefore, the above approach is to use 'sbatch' and write its
# output to a known file name. Then, we use `tail -f` to print that file as
# it gets filled in by the 'sbatch' command. The 'sbatch' command is run
# with --wait but is backgrounded to allow this to happen. Then we wait for
# the 'sbatch' command to complete and then we kill the 'tail -f' command.
# That might seem overly complex, but it gets the job done.

0 comments on commit ff791f8

Please sign in to comment.