Skip to content

Commit

Permalink
[develop] Feature cicd scorecard metric (#1079)
Browse files Browse the repository at this point in the history
* Update CI/CD scripts to include skill-score metric output so that follow-on metrics collection can display it on metrics Dashboard.
* Update Jenkinsfile to fix post() section that calls follow-on metrics collection job so that it is only called once at the end, regardless of whether any platform's builds or tests fail independently.
* Update the Jenkinsfile to skip platform Nodes that appear to be offline, rather than put them in the launch queue. This also means we can re-add the NOAAcloud platforms to the list of possible Nodes to attempt. They will be skipped if they are not online.
* Update Jenkinsfile to include timeout limits on Build stage and Test stage, so they don't run forever.
* Update Jenkinsfile to allow seeing timestamps in the Jenkins console log.

---------

Co-authored-by: EdwardSnyder-NOAA <[email protected]>
  • Loading branch information
BruceKropp-Raytheon and EdwardSnyder-NOAA authored Apr 25, 2024
1 parent 744bf17 commit 527b242
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 34 deletions.
49 changes: 32 additions & 17 deletions .cicd/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ pipeline {
disableConcurrentBuilds()
overrideIndexTriggers(false)
skipDefaultCheckout(true)
timestamps()
timeout(time: 12, unit: 'HOURS')
}

parameters {
Expand Down Expand Up @@ -74,6 +76,11 @@ pipeline {
// Run on all platform/compiler combinations by default or build and test only on the platform(s) and
// compiler(s) specified by SRW_PLATFORM_FILTER and SRW_COMPILER_FILTER
when {
beforeAgent true
expression {
return nodesByLabel(env.SRW_PLATFORM).size() > 0
}

allOf {
anyOf {
expression { params.SRW_PLATFORM_FILTER == 'all' }
Expand Down Expand Up @@ -137,6 +144,7 @@ pipeline {
sh "STAGE_NAME=${env.STAGE_NAME} " + 'bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/disk_usage.sh"'
}
}

post {
always {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-time-srw_init.json", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
Expand All @@ -147,6 +155,10 @@ pipeline {

// Run the unified build script; if successful create a tarball of the build and upload to S3
stage('Build') {
options {
timeout(time: 4, unit: 'HOURS')
}

steps {
dir ("${env.SRW_PLATFORM}") {
echo "${env.STAGE_NAME} SRW (${env.SRW_COMPILER}) on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
Expand Down Expand Up @@ -183,6 +195,7 @@ pipeline {
environment {
TASK_DEPTH = "${env.SRW_WRAPPER_TASK_DEPTH}"
}

steps {
dir ("${env.SRW_PLATFORM}") {
echo "Running ${TASK_DEPTH} simple workflow script task tests on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
Expand All @@ -193,6 +206,10 @@ pipeline {

// Run the unified test script
stage('Test') {
options {
timeout(time: 8, unit: 'HOURS')
}

environment {
SRW_WE2E_EXPERIMENT_BASE_DIR = "${env.WORKSPACE}/${env.SRW_PLATFORM}/expt_dirs"
}
Expand Down Expand Up @@ -228,39 +245,37 @@ pipeline {
post {
success {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*_test_results-*-*.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/we2e_test_logs-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*-skill-score.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
}
always {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-time-srw_test.json", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-disk-usage${env.STAGE_NAME}.csv", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
// Remove the data sets from the experiments directory to conserve disk space
sh 'find "${SRW_WE2E_EXPERIMENT_BASE_DIR}" -regextype posix-extended -regex "^.*(orog|[0-9]{10})$" -type d | xargs rm -rf'
}
}
}

stage('Metrics') {
steps {
script {
CI_BRANCH_NAME=env.JOB_BASE_NAME.replace("%2F","%252F")
echo "Triggering job for branch ${CI_BRANCH_NAME}/${env.BUILD_NUMBER} ..."
build job: '/ufs-srweather-app/ufs-srw-metrics', parameters: [
string(name: 'CI_JOB_NAME', value: "ufs-srweather-app/metrics"),
string(name: 'CI_BUILD_NUMBER', value: "${CI_BRANCH_NAME}/${env.BUILD_NUMBER}")
], wait: false
}
}
}
}
}
}
}
// end of stages{}

// Uncomment the following block to re-enable PW clusters
/*
post {
always {
// Stop any Parallel Works clusters that were started during the pipeline execution
script {
// Trigger another job to collect all build statistics
CI_JOB_NAME=env.JOB_NAME.replace("/${env.JOB_BASE_NAME}","")
CI_BRANCH_NAME=env.JOB_BASE_NAME.replace("%2F","%252F")
echo "post: Triggering ufs-srweather-app/ufs-srw-metrics job for ${CI_JOB_NAME} on branch build ${CI_BRANCH_NAME}/${env.BUILD_NUMBER} ..."
build job: '/ufs-srweather-app/ufs-srw-metrics', parameters: [
string(name: 'CI_JOB_NAME', value: "${CI_JOB_NAME}"),
string(name: 'CI_BUILD_NUMBER', value: "${CI_BRANCH_NAME}/${env.BUILD_NUMBER}")
], wait: false

// Uncomment the following block to re-enable PW clusters
/*
// Stop any Parallel Works clusters that were started during the pipeline execution
// def pw_clusters = ['pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1']
def pw_clusters = ['pclusternoaav2use1']
def clusters = []
Expand All @@ -279,8 +294,8 @@ pipeline {
// PW_CLUSTER_NAME parameter
build job: 'parallel-works-jenkins-client/stop-cluster', parameters: [string(name: 'PW_CLUSTER_NAME', value: clusters[i])]
}
*/
}
}
}
*/
}
20 changes: 10 additions & 10 deletions .cicd/scripts/srw_metric_example.sh → .cicd/scripts/srw_metric.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,17 @@ else
fi

# Test directories
we2e_experiment_base_dir="${workspace}/../expt_dirs/metric_test"
we2e_test_dir="${workspace}/tests/WE2E"
we2e_test_name="grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0"
we2e_experiment_base_dir="${we2e_experiment_base_dir:=${workspace}/../expt_dirs/metric_test}"
we2e_test_dir="${we2e_test_dir:=${workspace}/tests/WE2E}"
we2e_test_name="${test_type:=grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0}"

pwd

# Setup the build environment
declare srw_compiler
srw_compiler=${SRW_COMPILER}
source etc/lmod-setup.sh ${platform,,}
module use modulefiles
source ${workspace}/etc/lmod-setup.sh ${platform,,}
module use ${workspace}/modulefiles
module load build_${platform,,}_${srw_compiler}

# Build srw
Expand Down Expand Up @@ -99,7 +99,7 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
rm -rf ${workspace}/Indy-Severe-Weather/
# Check if metprd data exists locally otherwise get it from S3
TEST_EXTRN_MDL_SOURCE_BASEDIR=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${SRW_PLATFORM}.yaml | awk '{print $NF}')
if [[ ! -d $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat ]] ; then
if [[ -d $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat ]] ; then
mkdir -p Indy-Severe-Weather/metprd/point_stat
cp -rp $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat Indy-Severe-Weather/metprd
elif [[ -f Indy-Severe-Weather.tgz ]]; then
Expand All @@ -108,7 +108,7 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
wget https://noaa-ufs-srw-pds.s3.amazonaws.com/sample_cases/release-public-v2.1.0/Indy-Severe-Weather.tgz
tar xvfz Indy-Severe-Weather.tgz
fi
[[ -f skill-score.txt ]] && rm skill-score.txt
[[ -f ${platform,,}-${srw_compiler}-skill-score.txt ]] && rm ${platform,,}-${srw_compiler}-skill-score.txt
# Skill score index is computed over several terms that are defined in parm/metplus/STATAnalysisConfig_skill_score.
# It is computed by aggregating the output from earlier runs of the Point-Stat and/or Grid-Stat tools over one or more cases.
# In this example, skill score index is a weighted average of 4 skill scores of RMSE statistics for wind speed, dew point temperature,
Expand All @@ -126,15 +126,15 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
sed -i 's|--load("conda")|load("conda")|g' ${workspace}/modulefiles/tasks/${platform,,}/run_vx.local.lua
fi
# Run stat_analysis
stat_analysis -config parm/metplus/STATAnalysisConfig_skill_score -lookin ${workspace}/Indy-Severe-Weather/metprd/point_stat -v 2 -out skill-score.txt
stat_analysis -config parm/metplus/STATAnalysisConfig_skill_score -lookin ${workspace}/Indy-Severe-Weather/metprd/point_stat -v 2 -out ${platform,,}-${srw_compiler}-skill-score.txt

# check skill-score.txt
cat skill-score.txt
cat ${platform,,}-${srw_compiler}-skill-score.txt

# get skill-score (SS_INDEX) and check if it is significantly smaller than 1.0
# A value greater than 1.0 indicates that the forecast model outperforms the reference,
# while a value less than 1.0 indicates that the reference outperforms the forecast.
tmp_string=$( tail -2 skill-score.txt | head -1 )
tmp_string=$( tail -2 ${platform,,}-${srw_compiler}-skill-score.txt | head -1 )
SS_INDEX=$(echo $tmp_string | awk -F " " '{print $NF}')
echo "Skill Score: ${SS_INDEX}"
if [[ ${SS_INDEX} < "0.700" ]]; then
Expand Down
15 changes: 8 additions & 7 deletions .cicd/scripts/srw_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)
# Get repository root from Jenkins WORKSPACE variable if set, otherwise, set
# relative to script directory.
declare workspace
if [[ -n "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
if [[ -d "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
workspace="${WORKSPACE}/${SRW_PLATFORM}"
else
workspace="$(cd -- "${script_dir}/../.." && pwd)"
Expand All @@ -26,20 +26,20 @@ else
fi

# Test directories
we2e_experiment_base_dir="${workspace}/expt_dirs"
we2e_test_dir="${workspace}/tests/WE2E"
export we2e_experiment_base_dir="${workspace}/expt_dirs"
export we2e_test_dir="${workspace}/tests/WE2E"

# Clean any stale test logs
rm -f ${workspace}/tests/WE2E/log.*
rm -f ${we2e_experiment_base_dir}/*/log.generate_FV3LAM_wflow ${we2e_experiment_base_dir}/*/log/* WE2E_summary*txt

# Run the end-to-end tests.
if "${SRW_WE2E_COMPREHENSIVE_TESTS}"; then
test_type="comprehensive"
export test_type="comprehensive"
else
test_type=${SRW_WE2E_SINGLE_TEST:-"coverage"}
export test_type=${SRW_WE2E_SINGLE_TEST:-"coverage"}
if [[ "${test_type}" = skill-score ]]; then
test_type="grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0"
export test_type="grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0"
fi
fi

Expand All @@ -48,7 +48,8 @@ cd ${we2e_test_dir}
progress_file="${workspace}/we2e_test_results-${platform}-${SRW_COMPILER}.txt"
/usr/bin/time -p -f '{\n "cpu": "%P"\n, "memMax": "%M"\n, "mem": {"text": "%X", "data": "%D", "swaps": "%W", "context": "%c", "waits": "%w"}\n, "pagefaults": {"major": "%F", "minor": "%R"}\n, "filesystem": {"inputs": "%I", "outputs": "%O"}\n, "time": {"real": "%e", "user": "%U", "sys": "%S"}\n}' -o ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-time-srw_test.json \
./setup_WE2E_tests.sh ${platform} ${SRW_PROJECT} ${SRW_COMPILER} ${test_type} \
--expt_basedir=${we2e_experiment_base_dir} | tee ${progress_file}
--expt_basedir=${we2e_experiment_base_dir} | tee ${progress_file}; \
[[ -f ${we2e_experiment_base_dir}/grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0/log.generate_FV3LAM_wflow ]] && ${workspace}/.cicd/scripts/srw_metric.sh run_stat_anly

# Set exit code to number of failures
set +e
Expand Down

0 comments on commit 527b242

Please sign in to comment.