Skip to content

Commit

Permalink
[develop] Feature cicd scorecard metric (#1079)
Browse files Browse the repository at this point in the history
* Update CI/CD scripts to include skill-score metric output so that follow-on metrics collection can display it on metrics Dashboard.
* Update Jenkinsfile to fix post() section that calls follow-on metrics collection job so that it is only called once at the end, regardless of whether any platform's builds or tests fail independently.
* Update the Jenkinsfile to skip platform Nodes that appear to be offline, rather than put them in the launch queue. This also means we can re-add the NOAAcloud platforms to the list of possible Nodes to attempt. They will be skipped if they are not online.
* Update Jenkinsfile to include timeout limits on Build stage and Test stage, so they don't run forever.
* Update Jenkinsfile to allow seeing timestamps in the Jenkins console log.

---------

Co-authored-by: EdwardSnyder-NOAA <[email protected]>
  • Loading branch information
BruceKropp-Raytheon and EdwardSnyder-NOAA authored Apr 25, 2024
1 parent 744bf17 commit 527b242
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 34 deletions.
49 changes: 32 additions & 17 deletions .cicd/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ pipeline {
disableConcurrentBuilds()
overrideIndexTriggers(false)
skipDefaultCheckout(true)
timestamps()
timeout(time: 12, unit: 'HOURS')
}

parameters {
Expand Down Expand Up @@ -74,6 +76,11 @@ pipeline {
// Run on all platform/compiler combinations by default or build and test only on the platform(s) and
// compiler(s) specified by SRW_PLATFORM_FILTER and SRW_COMPILER_FILTER
when {
beforeAgent true
expression {
return nodesByLabel(env.SRW_PLATFORM).size() > 0
}

allOf {
anyOf {
expression { params.SRW_PLATFORM_FILTER == 'all' }
Expand Down Expand Up @@ -137,6 +144,7 @@ pipeline {
sh "STAGE_NAME=${env.STAGE_NAME} " + 'bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/disk_usage.sh"'
}
}

post {
always {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-time-srw_init.json", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
Expand All @@ -147,6 +155,10 @@ pipeline {

// Run the unified build script; if successful create a tarball of the build and upload to S3
stage('Build') {
options {
timeout(time: 4, unit: 'HOURS')
}

steps {
dir ("${env.SRW_PLATFORM}") {
echo "${env.STAGE_NAME} SRW (${env.SRW_COMPILER}) on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
Expand Down Expand Up @@ -183,6 +195,7 @@ pipeline {
environment {
TASK_DEPTH = "${env.SRW_WRAPPER_TASK_DEPTH}"
}

steps {
dir ("${env.SRW_PLATFORM}") {
echo "Running ${TASK_DEPTH} simple workflow script task tests on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
Expand All @@ -193,6 +206,10 @@ pipeline {

// Run the unified test script
stage('Test') {
options {
timeout(time: 8, unit: 'HOURS')
}

environment {
SRW_WE2E_EXPERIMENT_BASE_DIR = "${env.WORKSPACE}/${env.SRW_PLATFORM}/expt_dirs"
}
Expand Down Expand Up @@ -228,39 +245,37 @@ pipeline {
post {
success {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*_test_results-*-*.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/we2e_test_logs-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*-skill-score.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
}
always {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-time-srw_test.json", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-disk-usage${env.STAGE_NAME}.csv", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
// Remove the data sets from the experiments directory to conserve disk space
sh 'find "${SRW_WE2E_EXPERIMENT_BASE_DIR}" -regextype posix-extended -regex "^.*(orog|[0-9]{10})$" -type d | xargs rm -rf'
}
}
}

stage('Metrics') {
steps {
script {
CI_BRANCH_NAME=env.JOB_BASE_NAME.replace("%2F","%252F")
echo "Triggering job for branch ${CI_BRANCH_NAME}/${env.BUILD_NUMBER} ..."
build job: '/ufs-srweather-app/ufs-srw-metrics', parameters: [
string(name: 'CI_JOB_NAME', value: "ufs-srweather-app/metrics"),
string(name: 'CI_BUILD_NUMBER', value: "${CI_BRANCH_NAME}/${env.BUILD_NUMBER}")
], wait: false
}
}
}
}
}
}
}
// end of stages{}

// Uncomment the following block to re-enable PW clusters
/*
post {
always {
// Stop any Parallel Works clusters that were started during the pipeline execution
script {
// Trigger another job to collect all build statistics
CI_JOB_NAME=env.JOB_NAME.replace("/${env.JOB_BASE_NAME}","")
CI_BRANCH_NAME=env.JOB_BASE_NAME.replace("%2F","%252F")
echo "post: Triggering ufs-srweather-app/ufs-srw-metrics job for ${CI_JOB_NAME} on branch build ${CI_BRANCH_NAME}/${env.BUILD_NUMBER} ..."
build job: '/ufs-srweather-app/ufs-srw-metrics', parameters: [
string(name: 'CI_JOB_NAME', value: "${CI_JOB_NAME}"),
string(name: 'CI_BUILD_NUMBER', value: "${CI_BRANCH_NAME}/${env.BUILD_NUMBER}")
], wait: false

// Uncomment the following block to re-enable PW clusters
/*
// Stop any Parallel Works clusters that were started during the pipeline execution
// def pw_clusters = ['pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1']
def pw_clusters = ['pclusternoaav2use1']
def clusters = []
Expand All @@ -279,8 +294,8 @@ pipeline {
// PW_CLUSTER_NAME parameter
build job: 'parallel-works-jenkins-client/stop-cluster', parameters: [string(name: 'PW_CLUSTER_NAME', value: clusters[i])]
}
*/
}
}
}
*/
}
20 changes: 10 additions & 10 deletions .cicd/scripts/srw_metric_example.sh → .cicd/scripts/srw_metric.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,17 @@ else
fi

# Test directories
we2e_experiment_base_dir="${workspace}/../expt_dirs/metric_test"
we2e_test_dir="${workspace}/tests/WE2E"
we2e_test_name="grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0"
we2e_experiment_base_dir="${we2e_experiment_base_dir:=${workspace}/../expt_dirs/metric_test}"
we2e_test_dir="${we2e_test_dir:=${workspace}/tests/WE2E}"
we2e_test_name="${test_type:=grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0}"

pwd

# Setup the build environment
declare srw_compiler
srw_compiler=${SRW_COMPILER}
source etc/lmod-setup.sh ${platform,,}
module use modulefiles
source ${workspace}/etc/lmod-setup.sh ${platform,,}
module use ${workspace}/modulefiles
module load build_${platform,,}_${srw_compiler}

# Build srw
Expand Down Expand Up @@ -99,7 +99,7 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
rm -rf ${workspace}/Indy-Severe-Weather/
# Check if metprd data exists locally otherwise get it from S3
TEST_EXTRN_MDL_SOURCE_BASEDIR=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${SRW_PLATFORM}.yaml | awk '{print $NF}')
if [[ ! -d $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat ]] ; then
if [[ -d $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat ]] ; then
mkdir -p Indy-Severe-Weather/metprd/point_stat
cp -rp $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat Indy-Severe-Weather/metprd
elif [[ -f Indy-Severe-Weather.tgz ]]; then
Expand All @@ -108,7 +108,7 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
wget https://noaa-ufs-srw-pds.s3.amazonaws.com/sample_cases/release-public-v2.1.0/Indy-Severe-Weather.tgz
tar xvfz Indy-Severe-Weather.tgz
fi
[[ -f skill-score.txt ]] && rm skill-score.txt
[[ -f ${platform,,}-${srw_compiler}-skill-score.txt ]] && rm ${platform,,}-${srw_compiler}-skill-score.txt
# Skill score index is computed over several terms that are defined in parm/metplus/STATAnalysisConfig_skill_score.
# It is computed by aggregating the output from earlier runs of the Point-Stat and/or Grid-Stat tools over one or more cases.
# In this example, skill score index is a weighted average of 4 skill scores of RMSE statistics for wind speed, dew point temperature,
Expand All @@ -126,15 +126,15 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
sed -i 's|--load("conda")|load("conda")|g' ${workspace}/modulefiles/tasks/${platform,,}/run_vx.local.lua
fi
# Run stat_analysis
stat_analysis -config parm/metplus/STATAnalysisConfig_skill_score -lookin ${workspace}/Indy-Severe-Weather/metprd/point_stat -v 2 -out skill-score.txt
stat_analysis -config parm/metplus/STATAnalysisConfig_skill_score -lookin ${workspace}/Indy-Severe-Weather/metprd/point_stat -v 2 -out ${platform,,}-${srw_compiler}-skill-score.txt

# check skill-score.txt
cat skill-score.txt
cat ${platform,,}-${srw_compiler}-skill-score.txt

# get skill-score (SS_INDEX) and check if it is significantly smaller than 1.0
# A value greater than 1.0 indicates that the forecast model outperforms the reference,
# while a value less than 1.0 indicates that the reference outperforms the forecast.
tmp_string=$( tail -2 skill-score.txt | head -1 )
tmp_string=$( tail -2 ${platform,,}-${srw_compiler}-skill-score.txt | head -1 )
SS_INDEX=$(echo $tmp_string | awk -F " " '{print $NF}')
echo "Skill Score: ${SS_INDEX}"
if [[ ${SS_INDEX} < "0.700" ]]; then
Expand Down
15 changes: 8 additions & 7 deletions .cicd/scripts/srw_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)
# Get repository root from Jenkins WORKSPACE variable if set, otherwise, set
# relative to script directory.
declare workspace
if [[ -n "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
if [[ -d "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
workspace="${WORKSPACE}/${SRW_PLATFORM}"
else
workspace="$(cd -- "${script_dir}/../.." && pwd)"
Expand All @@ -26,20 +26,20 @@ else
fi

# Test directories
we2e_experiment_base_dir="${workspace}/expt_dirs"
we2e_test_dir="${workspace}/tests/WE2E"
export we2e_experiment_base_dir="${workspace}/expt_dirs"
export we2e_test_dir="${workspace}/tests/WE2E"

# Clean any stale test logs
rm -f ${workspace}/tests/WE2E/log.*
rm -f ${we2e_experiment_base_dir}/*/log.generate_FV3LAM_wflow ${we2e_experiment_base_dir}/*/log/* WE2E_summary*txt

# Run the end-to-end tests.
if "${SRW_WE2E_COMPREHENSIVE_TESTS}"; then
test_type="comprehensive"
export test_type="comprehensive"
else
test_type=${SRW_WE2E_SINGLE_TEST:-"coverage"}
export test_type=${SRW_WE2E_SINGLE_TEST:-"coverage"}
if [[ "${test_type}" = skill-score ]]; then
test_type="grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0"
export test_type="grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0"
fi
fi

Expand All @@ -48,7 +48,8 @@ cd ${we2e_test_dir}
progress_file="${workspace}/we2e_test_results-${platform}-${SRW_COMPILER}.txt"
/usr/bin/time -p -f '{\n "cpu": "%P"\n, "memMax": "%M"\n, "mem": {"text": "%X", "data": "%D", "swaps": "%W", "context": "%c", "waits": "%w"}\n, "pagefaults": {"major": "%F", "minor": "%R"}\n, "filesystem": {"inputs": "%I", "outputs": "%O"}\n, "time": {"real": "%e", "user": "%U", "sys": "%S"}\n}' -o ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-time-srw_test.json \
./setup_WE2E_tests.sh ${platform} ${SRW_PROJECT} ${SRW_COMPILER} ${test_type} \
--expt_basedir=${we2e_experiment_base_dir} | tee ${progress_file}
--expt_basedir=${we2e_experiment_base_dir} | tee ${progress_file}; \
[[ -f ${we2e_experiment_base_dir}/grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0/log.generate_FV3LAM_wflow ]] && ${workspace}/.cicd/scripts/srw_metric.sh run_stat_anly

# Set exit code to number of failures
set +e
Expand Down

0 comments on commit 527b242

Please sign in to comment.