Skip to content

Commit

Permalink
upgrade testing: make script error handling more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
tgross committed Feb 19, 2025
1 parent 86e1d6d commit d0a749f
Show file tree
Hide file tree
Showing 13 changed files with 247 additions and 195 deletions.
73 changes: 44 additions & 29 deletions enos/enos-scenario-upgrade.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -108,11 +108,14 @@ scenario "upgrade" {

module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token

# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
Expand Down Expand Up @@ -178,11 +181,14 @@ scenario "upgrade" {
]

variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token

# driving the upgrade
servers = step.provision_cluster.servers
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
Expand All @@ -202,11 +208,14 @@ scenario "upgrade" {

module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token

# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
Expand Down Expand Up @@ -251,14 +260,14 @@ scenario "upgrade" {
depends_on = [step.server_upgrade_test_cluster_health]

description = <<-EOF
Takes the clients one by one, writes some dynamic metadata to them,
Takes the clients one by one, writes some dynamic metadata to them,
updates the binary with the new one previously fetched and restarts them.
Important: The path where the binary will be placed is hardcoded to match
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
but for now it is:
* "C:/opt/nomad.exe" for windows
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
To ensure the clients are upgraded one by one, they use the depends_on meta,
Expand All @@ -274,11 +283,14 @@ scenario "upgrade" {
]

variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token

# configuring assertions
clients = step.provision_cluster.clients
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
Expand All @@ -292,17 +304,20 @@ scenario "upgrade" {
depends_on = [step.upgrade_clients]

description = <<-EOF
Verify the health of the cluster by checking the status of all servers, nodes,
Verify the health of the cluster by checking the status of all servers, nodes,
jobs and allocs and stopping random allocs to check for correct reschedules"
EOF

module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token

# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
Expand Down
16 changes: 3 additions & 13 deletions enos/modules/fetch_artifactory/scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,15 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

# Downloads a zip archive from Artifactory and unpacks it.
# Required environment:
#   TOKEN       - bearer token sent in the Authorization header
#   URL         - location of the archive to download
#   LOCAL_ZIP   - path the archive is downloaded to (removed at the end)
#   BINARY_PATH - directory the archive is extracted into

# No -x here: xtrace would echo the wget command line, including $TOKEN.
set -euo pipefail

# Under `set -e` a separate `$?` check is never reached when the command
# fails, so the failure is handled on the command itself.
wget --header="Authorization: Bearer $TOKEN" -O "$LOCAL_ZIP" "$URL" || {
  echo "Error downloading file." >&2
  exit 1
}
echo "File downloaded to $LOCAL_ZIP"

mkdir -p "$BINARY_PATH"
unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH" || {
  echo "Error unzipping file." >&2
  exit 1
}
echo "File unzipped to $BINARY_PATH"

rm -- "$LOCAL_ZIP"
2 changes: 1 addition & 1 deletion enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -xeuo pipefail
set -euo pipefail

TIMEOUT=10
INTERVAL=2
Expand Down
65 changes: 33 additions & 32 deletions enos/modules/test_cluster_health/scripts/allocs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,43 @@
set -euo pipefail

# Reports a fatal error and aborts the script.
# Arguments: $1 - human-readable failure message
# Outputs:   writes "Error: <message>" followed by a newline to stderr
# Returns:   never returns; exits the shell with status 1
error_exit() {
  printf 'Error: %s\n' "${1}" >&2
  exit 1
}

MAX_WAIT_TIME=40
MAX_WAIT_TIME=120
POLL_INTERVAL=2

elapsed_time=0

# Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running

while true; do
allocs=$(nomad alloc status -json)
if [ $? -ne 0 ]; then
error_exit "Error running 'nomad alloc status': $allocs"
fi
running_allocs=
allocs_length=

running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
allocs_length=$(echo $running_allocs | jq 'length')
if [ -z "$allocs_length" ]; then
error_exit "No allocs found"
fi
# Refreshes the snapshot of running allocations and compares it to the target.
# Globals:   ALLOC_COUNT (read); running_allocs, allocs_length (written)
# Returns:   0 when the number of running allocations equals ALLOC_COUNT,
#            1 otherwise. Aborts through error_exit when nomad or jq fails.
checkAllocsCount() {
  local allocs
  allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"

  # Callers run this as `checkAllocsCount && break`, and errexit is ignored
  # inside an && list, so every command that can fail is checked explicitly.
  running_allocs=$(jq '[.[] | select(.ClientStatus == "running")]' <<<"$allocs") \
    || error_exit "Invalid alloc status -json output"
  allocs_length=$(jq 'length' <<<"$running_allocs") \
    || error_exit "Invalid alloc status -json output"

  if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
    return 0
  fi

  return 1
}

# Poll until ALLOC_COUNT allocations are running, giving up once
# elapsed_time reaches MAX_WAIT_TIME.
while true; do
  checkAllocsCount && break

  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
    # Listing the offending allocations is best effort: the timeout is
    # reported even when this extra query fails.
    not_running=$(nomad alloc status -json \
      | jq -r '.[] | select(.ClientStatus != "running") | .ID') \
      || not_running="(unable to list allocations)"
    # error_exit formats its argument with %s, so a literal "\n" would be
    # printed verbatim; a real newline is embedded instead.
    error_exit "Some allocs are not running:"$'\n'"$not_running"
  fi

  # Report the count (allocs_length), not the full JSON array (running_allocs).
  echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
  sleep "$POLL_INTERVAL"
  elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
Expand All @@ -48,19 +53,16 @@ echo "All ALLOCS are running."
random_index=$((RANDOM % allocs_length))
random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")

error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
if [ $? -ne 0 ]; then
error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
fi
nomad alloc stop "$random_alloc_id" \
|| error_exit "Failed to stop allocation $random_alloc_id"

echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
elapsed_time=0

while true; do
alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')

if [ "$alloc_status" == "complete" ]; then
break
alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
if [ "$alloc_status" == "complete" ]; then
break
fi

if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
Expand All @@ -76,18 +78,17 @@ echo "Waiting for all the allocations to be running again"
elapsed_time=0

# Poll until ALLOC_COUNT allocations are running again, giving up once
# elapsed_time reaches MAX_WAIT_TIME.
while true; do
  # reset, so a previous snapshot can never be mistaken for a fresh one
  running_allocs=
  allocs_length=

  checkAllocsCount && break

  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
    error_exit "Expected $ALLOC_COUNT running allocations, found $allocs_length after $elapsed_time seconds"
  fi

  # Report the count (allocs_length), not the full JSON array (running_allocs).
  echo "Expected $ALLOC_COUNT running allocations, found $allocs_length Retrying in $POLL_INTERVAL seconds..."
  sleep "$POLL_INTERVAL"
  elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
Expand Down
47 changes: 29 additions & 18 deletions enos/modules/test_cluster_health/scripts/clients.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
set -euo pipefail

# Reports a fatal error and aborts the script.
# Arguments: $1 - human-readable failure message
# Outputs:   writes "Error: <message>" followed by a newline to stderr
# Returns:   never returns; exits the shell with status 1
error_exit() {
  printf 'Error: %s\n' "${1}" >&2
  exit 1
}

Expand All @@ -15,32 +15,43 @@ MAX_WAIT_TIME=20 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks

elapsed_time=0
ready_clients=
last_error=

while true; do
clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length')
# Queries the nodes and checks how many of them report Status "ready".
# Globals:   CLIENT_COUNT (read); ready_clients, last_error (written)
# Returns:   0 when exactly CLIENT_COUNT nodes are ready (last_error is
#            cleared), 1 otherwise (last_error describes the mismatch).
#            Aborts through error_exit when nomad or jq fails.
checkReadyClients() {
  local clients_length

  ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') \
    || error_exit "Could not query node status"

  clients_length=$(jq 'length' <<<"$ready_clients") \
    || error_exit "Could not count ready clients"

  if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then
    last_error=
    return 0
  fi

  # The polling loop that calls this function owns the timeout; here we only
  # record why this attempt did not succeed.
  last_error="Unexpected number of ready clients: $clients_length"
  return 1
}

sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
# Checks that every node in the ready_clients snapshot is eligible for
# scheduling. Works only on the snapshot; it does not query nomad again.
# Globals:   ready_clients (read); last_error (written on failure)
# Outputs:   nothing on stdout
# Returns:   0 when no node has a SchedulingEligibility other than
#            "eligible", 1 otherwise (last_error holds one
#            "<ID> is ineligible" line per offending node).
checkEligibleClients() {
  local ineligible

  # A single jq pass yields one line per ineligible node; no lines means
  # every node is eligible.
  ineligible=$(jq -r '
    .[] | select(.SchedulingEligibility != "eligible") | "\(.ID) is ineligible"' \
    <<<"$ready_clients") || {
    last_error="Could not evaluate client eligibility"
    return 1
  }

  if [ -z "$ineligible" ]; then
    return 0
  fi

  last_error=$ineligible
  return 1
}

echo "$running_clients" | jq -c '.[]' | while read -r node; do
status=$(echo "$node" | jq -r '.Status')
eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
while true; do
checkReadyClients && checkEligibleClients && break

if [ "$eligibility" != "eligible" ]; then
error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!"
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "$last_error"
fi

sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

echo "All CLIENTS are eligible and running."
echo "All clients are eligible and running."
2 changes: 1 addition & 1 deletion enos/modules/test_cluster_health/scripts/jobs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
set -euo pipefail

# Reports a fatal error and aborts the script.
# Arguments: $1 - human-readable failure message
# Outputs:   writes "Error: <message>" followed by a newline to stderr
# Returns:   never returns; exits the shell with status 1
error_exit() {
  printf 'Error: %s\n' "${1}" >&2
  exit 1
}

Expand Down
Loading

0 comments on commit d0a749f

Please sign in to comment.