diff --git a/tests/rt.sh b/tests/rt.sh index c55eeaaddb..ee899c2f17 100755 --- a/tests/rt.sh +++ b/tests/rt.sh @@ -28,7 +28,6 @@ usage() { echo " -v verbose output" echo " -w for weekly_test, skip comparing baseline results" echo - set -x } [[ $# -eq 0 ]] && usage @@ -155,11 +154,11 @@ update_rtconf() { generate_log() { echo "rt.sh: Generating Regression Testing Log..." - set -x COMPILE_COUNTER=0 FAILED_COMPILES=() TEST_COUNTER=0 FAILED_TESTS=() + SKIPPED_TESTS=() FAILED_TEST_ID=() FAILED_COMPILE_LOGS=() FAILED_TEST_LOGS=() @@ -255,14 +254,14 @@ EOF COMPILE_RESULT="FAILED: UNABLE TO START COMPILE" FAIL_LOG="N/A" elif [[ -f fail_compile_${COMPILE_ID} ]]; then - COMPILE_RESULT="FAILED: UNABLE TO COMPILE" + COMPILE_RESULT="FAILED: UNABLE TO FINISH COMPILE" FAIL_LOG="${LOG_DIR}/compile_${COMPILE_ID}.log" if grep -q "quota" "${LOG_DIR}/compile_${COMPILE_ID}.log"; then COMPILE_RESULT="FAILED: DISK QUOTA ISSUE" FAIL_LOG="${LOG_DIR}/compile_${COMPILE_ID}.log" - elif grep -q "timeout" "${LOG_DIR}/compile_${COMPILE_ID}.log"; then - COMPILE_RESULT="FAILED: TEST TIMED OUT" - FAIL_LOG="${LOG_DIR}/compile_${COMPILE_ID}.log" + elif grep -q "TIME LIMIT" "${RUNDIR_ROOT}/compile_${COMPILE_ID}/err"; then + COMPILE_RESULT="FAILED: COMPILE TIMED OUT" + FAIL_LOG="${RUNDIR_ROOT}/compile_${COMPILE_ID}/err" fi else COMPILE_RESULT="PASS" @@ -330,21 +329,25 @@ EOF RT_TEST_TIME="" RT_TEST_MEM="" if [[ ${CREATE_BASELINE} == true && ${GEN_BASELINE} != "baseline" ]]; then - TEST_RESULT="SKIPPED (TEST DOES NOT GENERATE BASELINE)" + TEST_RESULT="SKIPPED: TEST DOES NOT GENERATE BASELINE" + SKIPPED_TESTS+=("TEST ${TEST_NAME}_${COMPILER}: ${TEST_RESULT}") + elif [[ ${COMPILE_RESULT} =~ FAILED ]]; then + TEST_RESULT="SKIPPED: ASSOCIATED COMPILE FAILED" + SKIPPED_TESTS+=("TEST ${TEST_NAME}_${COMPILER}: ${TEST_RESULT}") elif [[ ! -f "${LOG_DIR}/run_${TEST_NAME}_${COMPILER}.log" ]]; then - TEST_RESULT="FAILED: UNABLE TO START RUN" + TEST_RESULT="FAILED: UNABLE TO START TEST" FAIL_LOG="N/A" elif [[ -f fail_test_${TEST_NAME}_${COMPILER} ]]; then if [[ -f "${LOG_DIR}/rt_${TEST_NAME}_${COMPILER}.log" ]]; then if grep -q "FAIL" "${LOG_DIR}/rt_${TEST_NAME}_${COMPILER}.log"; then - TEST_RESULT="FAILED: UNABLE TO RUN COMPARISON" + TEST_RESULT="FAILED: UNABLE TO COMPLETE COMPARISON" FAIL_LOG="${LOG_DIR}/run_${TEST_NAME}_${COMPILER}.log" # We need to catch a "PASS" in rt_*.log even if a fail_test_* files exists # I am not sure why this can happen. elif grep -q "PASS" "${LOG_DIR}/rt_${TEST_NAME}_${COMPILER}.log"; then TEST_RESULT="PASS" else - TEST_RESULT="FAILED: BASELINE COMPARISON" + TEST_RESULT="FAILED: UNSUCCESSFUL BASELINE COMPARISON" FAIL_LOG="${LOG_DIR}/rt_${TEST_NAME}_${COMPILER}.log" fi else @@ -354,9 +357,9 @@ EOF if grep -q "quota" "${LOG_DIR}/run_${TEST_NAME}_${COMPILER}.log"; then TEST_RESULT="FAILED: DISK QUOTA ISSUE" FAIL_LOG="${LOG_DIR}/run_${TEST_NAME}_${COMPILER}.log" - elif grep -q "timeout" "${LOG_DIR}/run_${TEST_NAME}_${COMPILER}.log"; then + elif grep -q "TIME LIMIT" "${RUNDIR_ROOT}/${TEST_NAME}_${COMPILER}/err"; then TEST_RESULT="FAILED: TEST TIMED OUT" - FAIL_LOG="${LOG_DIR}/run_${TEST_NAME}_${COMPILER}.log" + FAIL_LOG="${RUNDIR_ROOT}/${TEST_NAME}_${COMPILER}/err" fi else TEST_RESULT="PASS" @@ -399,7 +402,7 @@ Starting Date/Time: ${TEST_START_TIME} Ending Date/Time: ${TEST_END_TIME} Total Time: ${elapsed_time} Compiles Completed: $((COMPILE_COUNTER-${#FAILED_COMPILES[@]}))/${COMPILE_COUNTER} -Tests Completed: $((TEST_COUNTER-${#FAILED_TESTS[@]}))/${TEST_COUNTER} +Tests Completed: $((TEST_COUNTER-${#FAILED_TESTS[@]}-${#SKIPPED_TESTS[@]}))/${TEST_COUNTER} EOF # PRINT FAILED COMPILES if [[ "${#FAILED_COMPILES[@]}" -ne "0" ]]; then @@ -674,14 +677,10 @@ done #B&N not run together [[ ${NEW_BASELINES_FILE} != '' && ${RUN_SINGLE_TEST} == true ]] && die "-b and -n options cannot be used at the same time" -[[ -o xtrace ]] && set_x='set -x' || set_x='set +x' - if [[ ${RTVERBOSE} == true ]]; then set -x fi -[[ -o xtrace ]] && set_x='set -x' || set_x='set +x' - if [[ -z "${ACCNR}" ]]; then echo "Please use -a to set group account to use on HPC" exit 1 @@ -694,12 +693,10 @@ echo "Account: ${ACCNR}" case ${MACHINE_ID} in wcoss2|acorn) echo "rt.sh: Setting up WCOSS2/Acorn" - set -x if [[ "${ECFLOW:-false}" == true ]] ; then module load ecflow/5.6.0.13 fi module load intel/19.1.3.304 python/3.8.6 - export colonifnco=":output" # hack DISKNM="/lfs/h2/emc/nems/noscrub/emc.nems/RT" QUEUE="dev" @@ -714,7 +711,6 @@ case ${MACHINE_ID} in ;; gaea) echo "rt.sh: Setting up gaea..." - set -x if [[ "${ROCOTO:-false}" == true ]] ; then module use /ncrc/proj/epic/rocoto/modulefiles module load rocoto @@ -747,7 +743,6 @@ case ${MACHINE_ID} in ;; hera) echo "rt.sh: Setting up hera..." - set -x if [[ "${ROCOTO:-false}" == true ]] ; then module load rocoto ROCOTO_SCHEDULER=slurm @@ -770,7 +765,6 @@ case ${MACHINE_ID} in ;; orion) echo "rt.sh: Setting up orion..." - set -x module load git/2.28.0 module load gcc/10.2.0 module load python/3.9.2 @@ -800,7 +794,6 @@ case ${MACHINE_ID} in ;; hercules) echo "rt.sh: Setting up hercules..." - set -x if [[ "${ROCOTO:-false}" == true ]] ; then module load contrib rocoto ROCOTO_SCHEDULER="slurm" @@ -828,7 +821,6 @@ case ${MACHINE_ID} in ;; jet) echo "rt.sh: Setting up jet..." - set -x CurJetOS=$(lsb_release -is) echo "=======Running on ${CurJetOS}=======" if [[ ${CurJetOS} == "CentOS" ]]; then @@ -861,7 +853,6 @@ case ${MACHINE_ID} in ;; s4) echo "rt.sh: Setting up s4..." - set -x if [[ "${ROCOTO:-false}" == true ]] ; then module load rocoto/1.3.2 ROCOTO_SCHEDULER=slurm @@ -892,7 +883,6 @@ case ${MACHINE_ID} in ;; derecho) echo "rt.sh: Setting up derecho..." - set -x if [[ "${ROCOTO:-false}" == true ]] ; then module use /glade/work/epicufsrt/contrib/derecho/rocoto/modulefiles module load rocoto @@ -930,7 +920,6 @@ case ${MACHINE_ID} in ;; stampede) echo "rt.sh: Setting up stampede..." - set -x export PYTHONPATH= if [[ "${ECFLOW:-false}" == true ]] ; then ECFLOW_START= @@ -948,7 +937,6 @@ case ${MACHINE_ID} in ;; expanse) echo "rt.sh: Setting up expanse..." - set -x export PYTHONPATH= if [[ "${ECFLOW:-false}" == true ]] ; then @@ -965,7 +953,6 @@ case ${MACHINE_ID} in ;; noaacloud) echo "rt.sh: Setting up noaacloud..." - set -x export PATH="/contrib/EPIC/bin:${PATH}" module use /apps/modules/modulefiles @@ -987,7 +974,6 @@ case ${MACHINE_ID} in die "Unknown machine ID, please edit detect_machine.sh file" ;; esac -eval "${set_x}" mkdir -p "${STMP}/${USER}" @@ -1374,5 +1360,4 @@ fi ## Lets verify all tests were run and that they passed generate_log -eval "${set_x}" echo "******Regression Testing Script Completed******" diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index 9f08aad3b4..403ecc4506 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -15,9 +15,7 @@ ECFLOW_RUNNING=false jobid=0 function compute_petbounds_and_tasks() { - echo "rt_utils.sh: ${TEST_ID}: Computing PET bounds and tasks." - [[ -o xtrace ]] && set_x='set -x' || set_x='set +x' - set +x + # each test MUST define ${COMPONENT}_tasks variable for all components it is using # and MUST NOT define those that it's not using or set the value to 0. @@ -96,13 +94,10 @@ function compute_petbounds_and_tasks() { # TASKS is now set to UFS_TASKS export TASKS=${UFS_tasks} - eval "${set_x}" } interrupt_job() { - echo "rt_utils.sh: Job ${jobid} interupted" - set -x - #echo "run_util.sh: interrupt_job called | Job#: ${jobid}" + echo "rt_utils.sh: Job ${jobid} interrupted" case ${SCHEDULER} in pbs) qdel "${jobid}" @@ -120,9 +115,6 @@ submit_and_wait() { echo "rt_utils.sh: Submitting job on scheduler: ${SCHEDULER}" [[ -z $1 ]] && exit 1 - [[ -o xtrace ]] && set_x='set -x' || set_x='set +x' - set +x - local -r job_card=$1 ROCOTO=${ROCOTO:-false} @@ -156,9 +148,9 @@ submit_and_wait() { do case ${SCHEDULER} in pbs) - set +e + set +e job_info=$( qstat "${jobid}" ) - set -e + set -e ;; slurm) job_info=$( squeue -u "${USER}" -j "${jobid}" ) @@ -185,9 +177,9 @@ submit_and_wait() { do case ${SCHEDULER} in pbs) - set +e + set +e job_info=$( qstat "${jobid}" ) - set -e + set -e ;; slurm) job_info=$( squeue -u "${USER}" -j "${jobid}" ) @@ -228,7 +220,6 @@ submit_and_wait() { echo "rt_utils.sh: *** WARNING ***: Job in a HELD state. Might want to stop manually." ;; #fail/completed cases - #pbs: E #slurm: F/FAILED TO/TIMEOUT CA/CANCELLED F|TO|CA|FAILED|TIMEOUT|CANCELLED) echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!!" @@ -253,14 +244,10 @@ submit_and_wait() { (( n=n+1 )) sleep 60 & wait $! done - - eval "${set_x}" } check_results() { echo "rt_utils.sh: Checking results of the regression test: ${TEST_ID}" - [[ -o xtrace ]] && set_x='set -x' || set_x='set +x' - set +x ROCOTO=${ROCOTO:-false} ECFLOW=${ECFLOW:-false} @@ -271,7 +258,7 @@ check_results() { #sleep 60 { - echo + echo echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" echo "working dir = ${RUNDIR}" echo "Checking test ${TEST_ID} results ...." @@ -391,8 +378,6 @@ check_results() { exit 1 fi fi - - eval "${set_x}" } @@ -411,7 +396,7 @@ kill_job() { rocoto_create_compile_task() { echo "rt_utils.sh: ${COMPILE_ID}: Creating ROCOTO compile task." - #new_compile=true + new_compile=true if [[ ${in_metatask} == true ]]; then in_metatask=false echo " " >> "${ROCOTO_XML}" @@ -528,7 +513,6 @@ rocoto_kill() { rocoto_step() { echo "rt_utils.sh: Running one iteration of rocotorun and rocotostat..." - set -e echo "Unknown" > rocoto_workflow.state # Run one iteration of rocotorun and rocotostat. ${ROCOTORUN} -v 10 -w "${ROCOTO_XML}" -d "${ROCOTO_DB}" @@ -566,19 +550,15 @@ rocoto_run() { set -e if [[ "${state:-Unknown}" == Done ]] ; then - set +x echo "Rocoto workflow has completed." - set -x return 0 elif [[ ${result} == 0 ]] ; then break # rocoto_step succeeded elif (( now_time-start_time > max_time || step_attempts >= max_step_attempts )) ; then - set +x hostnamein=$(hostname) echo "Rocoto commands have failed ${step_attempts} times, for $(( (now_time-start_time+30)/60 )) minutes." echo "There may be something wrong with the ${hostnamein} node or the batch system." echo "I'm giving up. Sorry." - set -x return 2 fi sleep $(( naptime * 2**((step_attempts-1)%4) * RANDOM/32767 )) @@ -592,7 +572,6 @@ ecflow_create_compile_task() { echo "rt_utils.sh: ${COMPILE_ID}: Creating ECFLOW compile task" export new_compile=true - cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/compile_${COMPILE_ID}.ecf" %include ${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log" 2>&1 & @@ -625,7 +604,7 @@ EOF else echo " trigger compile_${COMPILE_ID} == complete" >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def" fi - + } ecflow_run() { @@ -633,7 +612,7 @@ ecflow_run() { # NOTE: ECFLOW IS NOT SAFE TO RUN WITH set -e, PLEASE AVOID #ECF_HOST="${ECF_HOST:-${HOSTNAME}}" - + # Make sure ECF_HOST and ECF_PORT are set/ready on systems that have an # explicit ecflow node if [[ ${MACHINE_ID} == wcoss2 || ${MACHINE_ID} == acorn ]]; then @@ -665,11 +644,11 @@ ecflow_run() { ecflow_client --ping --host="${ECF_HOST}" --port="${ECF_PORT}" not_running=$? set -e - + if [[ ${not_running} -eq 1 ]]; then echo "rt_utils.sh: ecflow_server is not running on ${ECF_HOST}:${ECF_PORT}" echo "rt_utils.sh: attempting to start ecflow_server..." - + save_traps=$(trap) trap "" SIGINT # Ignore INT signal during ecflow startup case ${MACHINE_ID} in @@ -679,7 +658,7 @@ ecflow_run() { ;; *) ${ECFLOW_START} -p "${ECF_PORT}" -d "${RUNDIR_ROOT}/ecflow_server" - ;; + ;; esac ECFLOW_RUNNING=true @@ -689,7 +668,7 @@ ecflow_run() { ecflow_client --ping --host="${ECF_HOST}" --port="${ECF_PORT}" not_running=$? set -e - + if [[ ${not_running} -eq 1 ]]; then echo "rt_utils.sh: ERROR -- Failure to start ecflow. Exiting..." exit 1 @@ -698,7 +677,7 @@ ecflow_run() { echo "rt_utils.sh: Confirmed: ecflow_server is running on ${ECF_HOST}:${ECF_PORT}" ECFLOW_RUNNING=true fi - + echo "rt_utils.sh: Starting ECFLOW tasks..." set +e ecflow_client --load="${ECFLOW_RUN}/${ECFLOW_SUITE}.def" --host="${ECF_HOST}" --port="${ECF_PORT}" @@ -712,6 +691,7 @@ ecflow_run() { max_active_tasks=$( grep "task " <<< "${max_active_tasks}" ) max_active_tasks=$( grep -cP 'state:active|state:submitted|state:queued' <<< "${max_active_tasks}" ) echo "rt_utils.sh: Total number of tasks processed -- ${max_active_tasks}" + prev_active_tasks=${active_tasks} while [[ "${active_tasks}" -ne 0 ]] do sleep 10 & wait $! @@ -720,7 +700,13 @@ ecflow_run() { active_tasks=$( grep "task " <<< "${active_tasks}" ) active_tasks=$( grep -cP 'state:active|state:submitted|state:queued' <<< "${active_tasks}" ) set -e - echo "ECFLOW Tasks Remaining: ${active_tasks}/${max_active_tasks}" + if [[ ${active_tasks} -ne ${prev_active_tasks} ]]; then + echo + echo -n "ECFLOW Tasks Remaining: ${active_tasks}/${max_active_tasks} " + prev_active_tasks=${active_tasks} + else + echo -n "." + fi "${PATHRT}/abort_dep_tasks.py" done