upgrade testing: make script error handling more robust

hashicorp · Feb 19, 2025 · d0a749f · d0a749f
1 parent 86e1d6d
commit d0a749f
Show file tree

Hide file tree

Showing 13 changed files with 247 additions and 195 deletions.
diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl
@@ -108,11 +108,14 @@ scenario "upgrade" {
 
     module = module.test_cluster_health
     variables {
-      nomad_addr      = step.provision_cluster.nomad_addr
-      ca_file         = step.provision_cluster.ca_file
-      cert_file       = step.provision_cluster.cert_file
-      key_file        = step.provision_cluster.key_file
-      nomad_token     = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count    = var.server_count
       client_count    = local.clients_count
       jobs_count      = step.run_initial_workloads.jobs_count
@@ -178,11 +181,14 @@ scenario "upgrade" {
     ]
 
     variables {
-      nomad_addr           = step.provision_cluster.nomad_addr
-      ca_file              = step.provision_cluster.ca_file
-      cert_file            = step.provision_cluster.cert_file
-      key_file             = step.provision_cluster.key_file
-      nomad_token          = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # driving the upgrade
       servers              = step.provision_cluster.servers
       ssh_key_path         = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -202,11 +208,14 @@ scenario "upgrade" {
 
     module = module.test_cluster_health
     variables {
-      nomad_addr      = step.provision_cluster.nomad_addr
-      ca_file         = step.provision_cluster.ca_file
-      cert_file       = step.provision_cluster.cert_file
-      key_file        = step.provision_cluster.key_file
-      nomad_token     = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count    = var.server_count
       client_count    = local.clients_count
       jobs_count      = step.run_initial_workloads.jobs_count
@@ -251,14 +260,14 @@ scenario "upgrade" {
     depends_on = [step.server_upgrade_test_cluster_health]
 
     description = <<-EOF
-     Takes the clients one by one, writes some dynamic metadata to them, 
+     Takes the clients one by one, writes some dynamic metadata to them,
     updates the binary with the new one previously fetched and restarts them.
 
-    Important: The path where the binary will be placed is hardcoded to match 
+    Important: The path where the binary will be placed is hardcoded to match
     what the provision-cluster module does. It can be configurable in the future
     but for now it is:
 
-     * "C:/opt/nomad.exe" for windows 
+     * "C:/opt/nomad.exe" for windows
      * "/usr/local/bin/nomad" for linux
 
     To ensure the clients are upgraded one by one, they use the depends_on meta,
@@ -274,11 +283,14 @@ scenario "upgrade" {
     ]
 
     variables {
-      nomad_addr           = step.provision_cluster.nomad_addr
-      ca_file              = step.provision_cluster.ca_file
-      cert_file            = step.provision_cluster.cert_file
-      key_file             = step.provision_cluster.key_file
-      nomad_token          = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # driving the upgrade
       clients              = step.provision_cluster.clients
       ssh_key_path         = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -292,17 +304,20 @@ scenario "upgrade" {
     depends_on = [step.upgrade_clients]
 
     description = <<-EOF
-    Verify the health of the cluster by checking the status of all servers, nodes, 
+    Verify the health of the cluster by checking the status of all servers, nodes,
     jobs and allocs and stopping random allocs to check for correct reschedules"
     EOF
 
     module = module.test_cluster_health
     variables {
-      nomad_addr      = step.provision_cluster.nomad_addr
-      ca_file         = step.provision_cluster.ca_file
-      cert_file       = step.provision_cluster.cert_file
-      key_file        = step.provision_cluster.key_file
-      nomad_token     = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count    = var.server_count
       client_count    = local.clients_count
       jobs_count      = step.run_initial_workloads.jobs_count

diff --git a/enos/modules/fetch_artifactory/scripts/install.sh b/enos/modules/fetch_artifactory/scripts/install.sh
@@ -2,25 +2,15 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1
 
-set -xeuo pipefail
+set -euo pipefail
 
 wget --header="Authorization: Bearer $TOKEN" -O "$LOCAL_ZIP" "$URL"
 
-if [ $? -eq 0 ]; then
-    echo "File downloaded successfully: $LOCAL_ZIP"
-else
-    echo "Error downloading file." >&2
-    exit 1
-fi
+echo "File downloaded to $LOCAL_ZIP"
 
 mkdir -p "$BINARY_PATH"
 unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH"
 
-if [ $? -eq 0 ]; then
-    echo "File unzipped successfully to $BINARY_PATH"
-else
-    echo "Error unzipping file." >&2
-    exit 1
-fi
+echo "File unzipped to $BINARY_PATH"
 
 rm "$LOCAL_ZIP"
diff --git a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
@@ -2,7 +2,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1
 
-set -xeuo pipefail
+set -euo pipefail
 
 TIMEOUT=10
 INTERVAL=2

diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh
@@ -5,38 +5,43 @@
 set -euo pipefail
 
 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }
 
-MAX_WAIT_TIME=40
+MAX_WAIT_TIME=120
 POLL_INTERVAL=2
 
 elapsed_time=0
 
 # Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running
 
-while true; do    
-    allocs=$(nomad alloc status -json)
-    if [ $? -ne 0 ]; then
-        error_exit "Error running 'nomad alloc status': $allocs"
-    fi
+running_allocs=
+allocs_length=
 
-    running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
-    allocs_length=$(echo $running_allocs | jq 'length')
-    if [ -z "$allocs_length" ];  then
-        error_exit "No allocs found"
-    fi
+checkAllocsCount() {
+    local allocs
+    allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"
+
+    running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
+    allocs_length=$(echo "$running_allocs" | jq 'length') \
+        || error_exit "Invalid alloc status -json output"
 
     if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
-       break
+        return 0
     fi
 
+    return 1
+}
+
+while true; do
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"   error_exit "Unexpected number of ready clients: $clients_length"
+        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
     fi
 
-    echo "Running allocs: $$running_allocs, expected "$ALLOC_COUNT". Waiting for $elapsed_time  Retrying in $POLL_INTERVAL seconds..."
+    echo "Running allocs: $running_allocs, expected $ALLOC_COUNT. Waiting for $elapsed_time  Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
@@ -48,19 +53,16 @@ echo "All ALLOCS are running."
 random_index=$((RANDOM % allocs_length))
 random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")
 
-error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
-if [ $? -ne 0 ]; then
-    error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
-fi
+nomad alloc stop "$random_alloc_id" \
+    || error_exit "Failed to stop allocation $random_alloc_id"
 
 echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
 elapsed_time=0
 
 while true; do
-    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus') 
-
-    if [ "$alloc_status" == "complete" ]; then 
-        break 
+    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
+    if [ "$alloc_status" == "complete" ]; then
+        break
     fi
 
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
@@ -76,18 +78,17 @@ echo "Waiting for all the allocations to be running again"
 elapsed_time=0
 
 while true; do
-    new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]')
-    running_new_allocs=$(echo "$new_allocs" | jq 'length')
-
-    if [ "$running_new_allocs" == "$ALLOC_COUNT" ]; then
-        break
-    fi
-
+    # reset the globals that checkAllocsCount repopulates on each attempt
+    running_allocs=
+    allocs_length=
+
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Expected $ALLOC_COUNT running allocations, found $running_new_allocs after $elapsed_time seconds"
+        error_exit "Expected $ALLOC_COUNT running allocations, found $running_allocs after $elapsed_time seconds"
     fi
 
-    echo "Expected $ALLOC_COUNT running allocations, found $running_new_allocs Retrying in $POLL_INTERVAL seconds..."
+    echo "Expected $ALLOC_COUNT running allocations, found $running_allocs Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

diff --git a/enos/modules/test_cluster_health/scripts/clients.sh b/enos/modules/test_cluster_health/scripts/clients.sh
@@ -5,7 +5,7 @@
 set -euo pipefail
 
 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }
 
@@ -15,32 +15,43 @@ MAX_WAIT_TIME=20  # Maximum wait time in seconds
 POLL_INTERVAL=2   # Interval between status checks
 
 elapsed_time=0
+ready_clients=
+last_error=
 
-while true; do
-    clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length')
+checkReadyClients() {
+    local clients_length
+
+    ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') ||
+        error_exit "Could not query node status"
 
+    clients_length=$(echo "$ready_clients" | jq 'length')
     if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then
-        break
+        last_error=
+        return 0
     fi
 
-    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Unexpected number of ready clients: $clients_length"
-    fi
+    last_error="Unexpected number of ready clients: $clients_length"
+    return 1
+}
 
-    sleep "$POLL_INTERVAL"
-    elapsed_time=$((elapsed_time + POLL_INTERVAL))
-done
+checkEligibleClients() {
+    echo "$ready_clients" | jq -e '
+        map(select(.SchedulingEligibility != "eligible")) | length == 0' && return 0
 
-clients=$(nomad node status -json)
-running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]')
+    last_error=$(echo "$ready_clients" | jq -r '
+        map(select(.SchedulingEligibility != "eligible")) | "\(.[].ID) is ineligible"')
+    return 1
+}
 
-echo "$running_clients" | jq -c '.[]' | while read -r node; do
-    status=$(echo "$node" | jq -r '.Status')
-    eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
+while true; do
+    checkReadyClients && checkEligibleClients && break
 
-    if [ "$eligibility" != "eligible" ]; then
-        error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!"
+    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+        error_exit "$last_error"
     fi
+
+    sleep "$POLL_INTERVAL"
+    elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
 
-echo "All CLIENTS are eligible and running."
+echo "All clients are eligible and running."
diff --git a/enos/modules/test_cluster_health/scripts/jobs.sh b/enos/modules/test_cluster_health/scripts/jobs.sh
@@ -5,7 +5,7 @@
 set -euo pipefail
 
 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }