From 1aaef05d317cd1eec548ef2b9842679c531cef8b Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Tue, 13 Feb 2024 18:15:59 -0500 Subject: [PATCH] Jenkins Pipeline updates for Canceling Jobs (#2307) Tuning updates for Jenkins Pipeline : - Added short circuit for all parallel runs of cases on error of any - Fixed canceling of all scheduled jobs on first case error - Added feature to save error log files to Jenkins Archive facility on fail --- Jenkinsfile | 71 +++++++++++++++------------ ci/cases/pr/C48mx500_3DVarAOWCDA.yaml | 3 +- ci/scripts/run-check_ci.sh | 16 +++--- ci/scripts/utils/ci_utils.sh | 8 +++ 4 files changed, 59 insertions(+), 39 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9f3688ea6c..be62a20512 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,7 @@ pipeline { options { skipDefaultCheckout() - buildDiscarder(logRotator(numToKeepStr: '2')) + parallelsAlwaysFailFast() } stages { // This initial stage is used to get the Machine name from the GitHub labels on the PR @@ -45,14 +45,14 @@ pipeline { properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in', 'Hera-EMC', 'Orion-EMC'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])]) HOME = "${WORKSPACE}/TESTDIR" commonworkspace = "${WORKSPACE}" - sh(script: "mkdir -p ${HOME}/RUNTESTS", returnStatus: true) + sh(script: "mkdir -p ${HOME}/RUNTESTS") pullRequest.addLabel("CI-${Machine}-Building") if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Ready") }) { pullRequest.removeLabel("CI-${Machine}-Ready") + } } } } - } stage('Build System') { matrix { @@ -71,35 +71,41 @@ pipeline { steps { script { def HOMEgfs = "${HOME}/${system}" // local HOMEgfs is used to build the system on per system basis under the common workspace HOME - sh(script: "mkdir -p ${HOMEgfs}", returnStatus: true) + sh(script: "mkdir -p ${HOMEgfs}") ws(HOMEgfs) { env.MACHINE_ID = machine // MACHINE_ID is used in the build scripts to determine the machine and is added to the shell environment if (fileExists("${HOMEgfs}/sorc/BUILT_semaphor")) { // if the system is already built, skip the build in the case of re-runs sh(script: "cat ${HOMEgfs}/sorc/BUILT_semaphor", returnStdout: true).trim() // TODO: and user configurable control to manage build semphore - ws(commonworkspace) { pullRequest.comment("Cloned PR already built (or build skipped) on ${machine} in directory ${HOMEgfs}") } + pullRequest.comment("Cloned PR already built (or build skipped) on ${machine} in directory ${HOMEgfs}
Still doing a checkout to get the latest changes") + sh(script: 'source workflow/gw_setup.sh; git pull --recurse-submodules') + dir('sorc') { + sh(script: './link_workflow.sh') + } } else { checkout scm - sh(script: 'source workflow/gw_setup.sh;which git;git --version;git submodule update --init --recursive', returnStatus: true) + sh(script: 'source workflow/gw_setup.sh;which git;git --version;git submodule update --init --recursive') def builds_file = readYaml file: 'ci/cases/yamls/build.yaml' def build_args_list = builds_file['builds'] def build_args = build_args_list[system].join(' ').trim().replaceAll('null', '') dir("${HOMEgfs}/sorc") { - sh(script: "${build_args}", returnStatus: true) - sh(script: './link_workflow.sh', returnStatus: true) - sh(script: "echo ${HOMEgfs} > BUILT_semaphor", returnStatus: true) + sh(script: "${build_args}") + sh(script: './link_workflow.sh') + sh(script: "echo ${HOMEgfs} > BUILT_semaphor") } } - if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Building") }) { - pullRequest.removeLabel("CI-${Machine}-Building") - } - pullRequest.addLabel("CI-${Machine}-Running") - } + if (env.CHANGE_ID && system == 'gfs') { + if (pullRequest.labels.any { value -> value.matches("CI-${Machine}-Building") }) { + pullRequest.removeLabel("CI-${Machine}-Building") + } + pullRequest.addLabel("CI-${Machine}-Running") + } + } + } } } } } } -} stage('Run Tests') { matrix { @@ -108,19 +114,19 @@ pipeline { axis { name 'Case' // TODO add dynamic list of cases from env vars (needs addtional plugins) - values 'C48_ATM', 'C48_S2SWA_gefs', 'C48_S2SW', 'C96_atm3DVar', 'C48mx500_3DVarAOWCDA', 'C96C48_hybatmDA', 'C96_atmsnowDA' + values 'C48_ATM', 'C48_S2SWA_gefs', 'C48_S2SW', 'C96_atm3DVar', 'C96C48_hybatmDA', 'C96_atmsnowDA' // 'C48mx500_3DVarAOWCDA' } } stages { stage('Create Experiment') { steps { script { - sh(script: "sed -n '/{.*}/!p' ${HOME}/gfs/ci/cases/pr/${Case}.yaml > ${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp", returnStatus: true) + sh(script: "sed -n '/{.*}/!p' ${HOME}/gfs/ci/cases/pr/${Case}.yaml > ${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp") def yaml_case = readYaml file: "${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp" system = yaml_case.experiment.system def HOMEgfs = "${HOME}/${system}" // local HOMEgfs is used to populate the XML on per system basis env.RUNTESTS = "${HOME}/RUNTESTS" - sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml", returnStatus: true) + sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml") } } } @@ -130,16 +136,27 @@ pipeline { HOMEgfs = "${HOME}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments ws(HOMEgfs) { pslot = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh get_pslot ${HOME}/RUNTESTS ${Case}", returnStdout: true).trim() - pullRequest.comment("**Running** experiment: ${Case} on ${Machine}
With the experiment in directory:
`${HOME}/RUNTESTS/${pslot}`") - try { - sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot}", returnStatus: true) - } catch (Exception e) { + // pullRequest.comment("**Running** experiment: ${Case} on ${Machine}
With the experiment in directory:
`${HOME}/RUNTESTS/${pslot}`") + err = sh(script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot}") + if (err != 0) { pullRequest.comment("**FAILURE** running experiment: ${Case} on ${Machine}") + sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh cancel_all_batch_jobs ${HOME}/RUNTESTS") + ws(HOME) { + if (fileExists('RUNTESTS/error.logs')) { + def fileContent = readFile 'RUNTESTS/error.logs' + def lines = fileContent.readLines() + for (line in lines) { + echo "archiving: ${line}" + archiveArtifacts artifacts: "${line}", fingerprint: true + } + } + } error("Failed to run experiments ${Case} on ${Machine}") } - pullRequest.comment("**SUCCESS** running experiment: ${Case} on ${Machine}") + // pullRequest.comment("**SUCCESS** running experiment: ${Case} on ${Machine}") } } + } } } @@ -175,14 +192,6 @@ pipeline { def timestamp = new Date().format('MM dd HH:mm:ss', TimeZone.getTimeZone('America/New_York')) pullRequest.comment("**CI FAILED** ${Machine} at ${timestamp}
Built and ran in directory `${HOME}`") } - if (fileExists('${HOME}/RUNTESTS/ci.log')) { - def fileContent = readFile '${HOME}/RUNTESTS/ci.log' - fileContent.eachLine { line -> - if (line.contains('.log')) { - archiveArtifacts artifacts: "${line}", fingerprint: true - } - } - } } } } diff --git a/ci/cases/pr/C48mx500_3DVarAOWCDA.yaml b/ci/cases/pr/C48mx500_3DVarAOWCDA.yaml index b972d3a445..6e9fc6d3de 100644 --- a/ci/cases/pr/C48mx500_3DVarAOWCDA.yaml +++ b/ci/cases/pr/C48mx500_3DVarAOWCDA.yaml @@ -17,6 +17,7 @@ arguments: start: warm yaml: {{ HOMEgfs }}/ci/cases/yamls/soca_gfs_defaults_ci.yaml -skip_ci_on_hosts: +skip_ci_on_host: - orion + - hera - hercules diff --git a/ci/scripts/run-check_ci.sh b/ci/scripts/run-check_ci.sh index f98f434462..8e1e927050 100755 --- a/ci/scripts/run-check_ci.sh +++ b/ci/scripts/run-check_ci.sh @@ -25,6 +25,7 @@ pslot=${2:-${pslot:-?}} # Name of the experiment being tested by this scr # TODO: Make this configurable (for now all scripts run from gfs for CI at runtime) HOMEgfs="${TEST_DIR}/gfs" RUNTESTS="${TEST_DIR}/RUNTESTS" +run_check_logfile="${RUNTESTS}/ci-run_check.log" # Source modules and setup logging echo "Source modules." @@ -77,15 +78,16 @@ while true; do { echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true echo "Experiment ${pslot} Terminated: *FAILED*" - } >> "${RUNTESTS}/ci.log" - + } | tee -a "${run_check_logfile}" error_logs=$(rocotostat -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs rocotocheck -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true { echo "Error logs:" echo "${error_logs}" - } >> "${RUNTESTS}/ci.log" - sed -i "s/\`\`\`//2g" "${RUNTESTS}/ci.log" - sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true + } | tee -a "${run_check_logfile}" + # rm -f "${RUNTESTS}/error.logs" + for log in ${error_logs}; do + echo "RUNTESTS${log#*RUNTESTS}" >> "${RUNTESTS}/error.logs" + done rc=1 break fi @@ -95,8 +97,7 @@ while true; do echo "Experiment ${pslot} Completed at $(date)" || true echo "with ${num_succeeded} successfully completed jobs" || true echo "Experiment ${pslot} Completed: *SUCCESS*" - } >> "${RUNTESTS}/ci.log" - sed -i "s/\`\`\`//2g" "${RUNTESTS}/ci.log" + } | tee -a "${run_check_logfile}" rc=0 break fi @@ -107,3 +108,4 @@ while true; do done exit "${rc}" + diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index 6f2426c388..ce2e039307 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -102,6 +102,14 @@ function get_pslot () { } +function cancel_all_batch_jobs () { + local RUNTESTS="${1}" + pslot_list=$(get_pslot_list "${RUNTESTS}") + for pslot in ${pslot_list}; do + cancel_batch_jobs "${pslot}" + done +} + function create_experiment () { local yaml_config="${1}"