diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl
index a3be7ddaef3..28aa6fd2d6d 100644
--- a/enos/enos-scenario-upgrade.hcl
+++ b/enos/enos-scenario-upgrade.hcl
@@ -108,11 +108,14 @@ scenario "upgrade" {
     module = module.test_cluster_health

     variables {
-      nomad_addr  = step.provision_cluster.nomad_addr
-      ca_file     = step.provision_cluster.ca_file
-      cert_file   = step.provision_cluster.cert_file
-      key_file    = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count   = step.run_initial_workloads.jobs_count
@@ -178,11 +181,14 @@ scenario "upgrade" {
     ]

     variables {
-      nomad_addr  = step.provision_cluster.nomad_addr
-      ca_file     = step.provision_cluster.ca_file
-      cert_file   = step.provision_cluster.cert_file
-      key_file    = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # driving the upgrade
       servers              = step.provision_cluster.servers
       ssh_key_path         = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -202,11 +208,14 @@ scenario "upgrade" {
     module = module.test_cluster_health

     variables {
-      nomad_addr  = step.provision_cluster.nomad_addr
-      ca_file     = step.provision_cluster.ca_file
-      cert_file   = step.provision_cluster.cert_file
-      key_file    = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count   = step.run_initial_workloads.jobs_count
@@ -251,14 +260,14 @@ scenario "upgrade" {
     depends_on = [step.server_upgrade_test_cluster_health]

     description = <<-EOF
-    Takes the clients one by one, writes some dynamic metadata to them, 
+    Takes the clients one by one, writes some dynamic metadata to them,
     updates the binary with the new one previously fetched and restarts them.

-    Important: The path where the binary will be placed is hardcoded to match 
+    Important: The path where the binary will be placed is hardcoded to match
     what the provision-cluster module does.
    It can be configurable in the future but for now it is:
-     * "C:/opt/nomad.exe" for windows 
+     * "C:/opt/nomad.exe" for windows
      * "/usr/local/bin/nomad" for linux

    To ensure the clients are upgraded one by one, they use the depends_on meta,
@@ -274,11 +283,14 @@ scenario "upgrade" {
     ]

     variables {
-      nomad_addr  = step.provision_cluster.nomad_addr
-      ca_file     = step.provision_cluster.ca_file
-      cert_file   = step.provision_cluster.cert_file
-      key_file    = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # driving the upgrade
       clients              = step.provision_cluster.clients
       ssh_key_path         = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -292,17 +304,20 @@ scenario "upgrade" {
     depends_on = [step.upgrade_clients]

     description = <<-EOF
-    Verify the health of the cluster by checking the status of all servers, nodes, 
+    Verify the health of the cluster by checking the status of all servers, nodes,
     jobs and allocs and stopping random allocs to check for correct reschedules"
     EOF

     module = module.test_cluster_health

     variables {
-      nomad_addr  = step.provision_cluster.nomad_addr
-      ca_file     = step.provision_cluster.ca_file
-      cert_file   = step.provision_cluster.cert_file
-      key_file    = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count   = step.run_initial_workloads.jobs_count
diff --git a/enos/modules/fetch_artifactory/scripts/install.sh b/enos/modules/fetch_artifactory/scripts/install.sh
index bf9249fad29..de49644e3c1 100755
--- a/enos/modules/fetch_artifactory/scripts/install.sh
+++ b/enos/modules/fetch_artifactory/scripts/install.sh
@@ -2,25 +2,15 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 wget --header="Authorization: Bearer $TOKEN" -O "$LOCAL_ZIP" "$URL"

-if [ $? -eq 0 ]; then
-    echo "File downloaded successfully: $LOCAL_ZIP"
-else
-    echo "Error downloading file." >&2
-    exit 1
-fi
+echo "File downloaded to $LOCAL_ZIP"

 mkdir -p "$BINARY_PATH"

 unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH"

-if [ $? -eq 0 ]; then
-    echo "File unzipped successfully to $BINARY_PATH"
-else
-    echo "Error unzipping file." >&2
-    exit 1
-fi
+echo "File unzipped to $BINARY_PATH"

 rm "$LOCAL_ZIP"
diff --git a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
old mode 100644
new mode 100755
index 4e325446e09..cf38b0c6ab1
--- a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
+++ b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
@@ -2,7 +2,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 TIMEOUT=10
 INTERVAL=2
diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh
index 41ad7b274fc..f8cc5abe5d9 100755
--- a/enos/modules/test_cluster_health/scripts/allocs.sh
+++ b/enos/modules/test_cluster_health/scripts/allocs.sh
@@ -5,38 +5,43 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

-MAX_WAIT_TIME=40
+MAX_WAIT_TIME=120
 POLL_INTERVAL=2
 elapsed_time=0

 # Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running

-while true; do
-    allocs=$(nomad alloc status -json)
-    if [ $? -ne 0 ]; then
-        error_exit "Error running 'nomad alloc status': $allocs"
-    fi
+running_allocs=
+allocs_length=

-    running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
-    allocs_length=$(echo $running_allocs | jq 'length')
-    if [ -z "$allocs_length" ]; then
-        error_exit "No allocs found"
-    fi
+checkAllocsCount() {
+    local allocs
+    allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"
+
+    running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
+    allocs_length=$(echo "$running_allocs" | jq 'length') \
+        || error_exit "Invalid alloc status -json output"

     if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
-        break
+        return 0
     fi
+    return 1
+}
+
+while true; do
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
-        error_exit "Unexpected number of ready clients: $clients_length"
+        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
     fi

-    echo "Running allocs: $$running_allocs, expected "$ALLOC_COUNT". Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
+    echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waited $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
@@ -48,19 +53,16 @@ echo "All ALLOCS are running."
 random_index=$((RANDOM % allocs_length))
 random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")

-error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
-if [ $? -ne 0 ]; then
-    error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
-fi
+nomad alloc stop "$random_alloc_id" \
+    || error_exit "Failed to stop allocation $random_alloc_id"

 echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
 elapsed_time=0

 while true; do
-    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
-
-    if [ "$alloc_status" == "complete" ]; then
-        break
+    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
+    if [ "$alloc_status" == "complete" ]; then
+        break
     fi

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
@@ -76,18 +78,17 @@ echo "Waiting for all the allocations to be running again"
 elapsed_time=0

 while true; do
-    new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]')
-    running_new_allocs=$(echo "$new_allocs" | jq 'length')
-
-    if [ "$running_new_allocs" == "$ALLOC_COUNT" ]; then
-        break
-    fi
-
+    # reset
+    running_allocs=
+    allocs_length=
+
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Expected $ALLOC_COUNT running allocations, found $running_new_allocs after $elapsed_time seconds"
+        error_exit "Expected $ALLOC_COUNT running allocations, found $allocs_length after $elapsed_time seconds"
     fi

-    echo "Expected $ALLOC_COUNT running allocations, found $running_new_allocs Retrying in $POLL_INTERVAL seconds..."
+    echo "Expected $ALLOC_COUNT running allocations, found $allocs_length. Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
diff --git a/enos/modules/test_cluster_health/scripts/clients.sh b/enos/modules/test_cluster_health/scripts/clients.sh
index 7895214dbfe..3a5e480ff70 100755
--- a/enos/modules/test_cluster_health/scripts/clients.sh
+++ b/enos/modules/test_cluster_health/scripts/clients.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

@@ -15,32 +15,43 @@
 MAX_WAIT_TIME=20  # Maximum wait time in seconds
 POLL_INTERVAL=2   # Interval between status checks
 elapsed_time=0
+ready_clients=
+last_error=

-while true; do
-    clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length')
+checkReadyClients() {
+    local clients_length
+
+    ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') ||
+        error_exit "Could not query node status"
+    clients_length=$(echo "$ready_clients" | jq 'length')

     if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then
-        break
+        last_error=
+        return 0
     fi

-    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Unexpected number of ready clients: $clients_length"
-    fi
+    last_error="Unexpected number of ready clients: $clients_length"
+    return 1
+}

-    sleep "$POLL_INTERVAL"
-    elapsed_time=$((elapsed_time + POLL_INTERVAL))
-done
+checkEligibleClients() {
+    echo "$ready_clients" | jq -e '
+        map(select(.SchedulingEligibility != "eligible")) | length == 0' && return 0

-clients=$(nomad node status -json)
-running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]')
+    last_error=$(echo "$ready_clients" | jq -r '
+        map(select(.SchedulingEligibility != "eligible")) | "\(.[].ID) is ineligible"')
+    return 1
+}

-echo "$running_clients" | jq -c '.[]' | while read -r node; do
-    status=$(echo "$node" | jq -r '.Status')
-    eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
+while true; do
+    checkReadyClients && checkEligibleClients && break

-    if [ "$eligibility" != "eligible" ]; then
-        error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!"
+ if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then + error_exit "$last_error" fi + + sleep "$POLL_INTERVAL" + elapsed_time=$((elapsed_time + POLL_INTERVAL)) done -echo "All CLIENTS are eligible and running." +echo "All clients are eligible and running." diff --git a/enos/modules/test_cluster_health/scripts/jobs.sh b/enos/modules/test_cluster_health/scripts/jobs.sh index c338b985d61..167a6650f1b 100755 --- a/enos/modules/test_cluster_health/scripts/jobs.sh +++ b/enos/modules/test_cluster_health/scripts/jobs.sh @@ -5,7 +5,7 @@ set -euo pipefail error_exit() { - printf 'Error: %s' "${1}" + printf 'Error: %s' "${1}" exit 1 } diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh index 40756c0a0e7..39d6953897e 100755 --- a/enos/modules/test_cluster_health/scripts/servers.sh +++ b/enos/modules/test_cluster_health/scripts/servers.sh @@ -5,7 +5,7 @@ set -euo pipefail error_exit() { - printf 'Error: %s' "${1}" + printf 'Error: %s' "${1}" exit 1 } @@ -13,58 +13,80 @@ MAX_WAIT_TIME=40 POLL_INTERVAL=2 elapsed_time=0 +last_error= +leader_last_index= +leader_last_term= # Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive -while true; do - servers=$(nomad operator autopilot health -json) - servers_healthy=$(echo "$servers" | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length') +checkAutopilotHealth() { + local autopilotHealth servers_healthy leader + autopilotHealth=$(nomad operator autopilot health -json) || return 1 + servers_healthy=$(echo "$autopilotHealth" | + jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length') if [ "$servers_healthy" -eq 0 ]; then error_exit "No servers found." fi if [ "$servers_healthy" -eq "$SERVER_COUNT" ]; then - break + leader=$(echo "$autopilotHealth" | jq -r '.Servers[] | select(.Leader == true)') + leader_last_index=$(echo "$leader" | jq -r '.LastIndex') + leader_last_term=$(echo "$leader" | jq -r '.LastTerm') + return 0 fi + last_error="Expected $SERVER_COUNT healthy servers but have $servers_healthy" + return 1 +} + +while true; do + checkAutopilotHealth && break + if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Unexpected number of healthy servers: $servers_healthy after $elapsed_time seconds." + error_exit "$last_error after $elapsed_time seconds." fi - echo "Servers found: $servers_healthy, expected: $SERVER_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." + echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) done + # Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader" -# We use the leader's last log index to use as teh measure for the other servers. +# We use the leader's last log index to use as teh measure for the other servers. -leader=$(echo $servers | jq -r '.Servers[] | select(.Leader == true)') -leader_last_index=$(echo $leader | jq -r '.LastIndex') -leader_last_term=$(echo $leader | jq -r '.LastTerm') +checkServerHealth() { + local ip node_info + ip=$1 + echo "Checking server health for $ip" -for ip in $SERVERS; do -while true; do - node_info=$(nomad agent-info -address "https://$ip:4646" -json) - if [ $? 
-ne 0 ]; then - error_exit "Unable to get info for node at $ip" - fi + node_info=$(nomad agent-info -address "https://$ip:4646" -json) \ + || error_exit "Unable to get info for node at $ip" - last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') - last_leader_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term') + last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') + last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term') - if [ "$last_log_index" -ge "$leader_last_index" ] && [ "$last_leader_term" -ge "$leader_last_term" ]; then - break - fi + if [ "$last_log_index" -ge "$leader_last_index" ] && + [ "$last_log_term" -ge "$leader_last_term" ]; then + return 0 + fi + + last_error="Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_log_term" + return 1 +} + +for ip in $SERVERS; do + while true; do + checkServerHealth "$ip" && break if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_leader_term after $elapsed_time seconds." + error_exit "$last_error after $elapsed_time seconds." fi - echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..." + echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) - done + done done echo "All servers are alive and up to date." diff --git a/enos/modules/test_cluster_health/scripts/versions.sh b/enos/modules/test_cluster_health/scripts/versions.sh old mode 100644 new mode 100755 diff --git a/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh b/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh old mode 100644 new mode 100755 index 4e325446e09..cf38b0c6ab1 --- a/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh +++ b/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh @@ -2,7 +2,7 @@ # Copyright (c) HashiCorp, Inc. # SPDX-License-Identifier: BUSL-1.1 -set -xeuo pipefail +set -euo pipefail TIMEOUT=10 INTERVAL=2 diff --git a/enos/modules/upgrade_clients/scripts/set_metadata.sh b/enos/modules/upgrade_clients/scripts/set_metadata.sh old mode 100644 new mode 100755 index 77ed5a5770e..45fb65981fd --- a/enos/modules/upgrade_clients/scripts/set_metadata.sh +++ b/enos/modules/upgrade_clients/scripts/set_metadata.sh @@ -4,16 +4,15 @@ set -euo pipefail -client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"') -if [ -z "$client_id" ]; then - echo "No client found at $CLIENT_IP" - exit 1 +if ! client_id=$(nomad node status -address "http://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"'); then + echo "No client found at $CLIENT_IP" + exit 1 fi -nomad node meta apply -node-id $client_id node_ip="$CLIENT_IP" nomad_addr=$NOMAD_ADDR -if [ $? -nq 0 ]; then - echo "Failed to set metadata for node: $client_id at $CLIENT_IP" - exit 1 +if ! 
nomad node meta apply \ + -node-id "$client_id" node_ip="$CLIENT_IP" nomad_addr="$NOMAD_ADDR"; then + echo "Failed to set metadata for node: $client_id at $CLIENT_IP" + exit 1 fi echo "Metadata updated in $client_id at $CLIENT_IP" diff --git a/enos/modules/upgrade_clients/scripts/verify_metadata.sh b/enos/modules/upgrade_clients/scripts/verify_metadata.sh old mode 100644 new mode 100755 index 7bf8b86cc5d..898718b6960 --- a/enos/modules/upgrade_clients/scripts/verify_metadata.sh +++ b/enos/modules/upgrade_clients/scripts/verify_metadata.sh @@ -5,7 +5,7 @@ set -euo pipefail error_exit() { - printf 'Error: %s' "${1}" + printf 'Error: %s' "${1}" exit 1 } @@ -13,63 +13,55 @@ MAX_WAIT_TIME=10 # Maximum wait time in seconds POLL_INTERVAL=2 # Interval between status checks elapsed_time=0 +last_error= +client_id= -while true; do - if nomad node status -address "https://$CLIENT_IP:4646" -self &>/dev/null; then - exit 0 +checkClientReady() { + local client client_status + echo "Checking client health for $CLIENT_IP" + + client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) || + error_exit "Unable to get info for node at $CLIENT_IP" + + client_status=$(echo "$client" | jq -r '.Status') + if [ "$client_status" == "ready" ]; then + client_id=$(echo "$client" | jq '.ID' | tr -d '"') + last_error= + return 0 fi + last_error="Node at $CLIENT_IP is ${client_status}, not ready" + return 1 +} + +while true; do + checkClientReady && break if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Node at $NOMAD_ADDR did not become available within $elapsed_time seconds." + error_exit "$last_error within $elapsed_time seconds." exit 1 fi - echo "Node at $NOMAD_ADDR not available yet. Retrying in $POLL_INTERVAL seconds..." + echo "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) done -elapsed_time=0 - -while true; do - client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) - if [ -z "$client" ]; then - error_exit "No client found at $CLIENT_IP" - fi - - client_status=$(echo $client | jq -r '.Status') - if [ "$client_status" == "ready" ]; then - break - fi - - if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Client at $CLIENT_IP did not reach 'ready' status within $MAX_WAIT_TIME seconds." - - fi - - echo "Current status: $client_status, not 'ready'. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..." - sleep $POLL_INTERVAL - elapsed_time=$((elapsed_time + POLL_INTERVAL)) -done - # Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade" -client_id=$(echo $client | jq '.ID' | tr -d '"') -client_meta=$(nomad node meta read -json -node-id $client_id) -if [ $? -nq 0 ]; then - echo "Failed to read metadata for node: $client_id" - exit 1 +if ! 
client_meta=$(nomad node meta read -json -node-id "$client_id"); then + echo "Failed to read metadata for node: $client_id" + exit 1 fi -node_ip=$(echo $client_meta | jq -r '.Dynamic.node_ip' ) -if ["$node_ip" != "$CLIENT_IP" ]; then - echo "Wrong value returned for node_ip: $node_ip" +meta_node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip' ) +if [ "$meta_node_ip" != "$CLIENT_IP" ]; then + echo "Wrong value returned for node_ip: $meta_node_ip" exit 1 fi -nomad_addr=$(echo $client_meta | jq -r '.Dynamic.nomad_addr' ) -if ["$nomad_addr" != $NOMAD_ADDR ]; then - echo "Wrong value returned for nomad_addr: $nomad_addr" +meta_nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr' ) +if [ "$meta_nomad_addr" != "$NOMAD_ADDR" ]; then + echo "Wrong value returned for nomad_addr: $meta_nomad_addr" exit 1 fi diff --git a/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh b/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh old mode 100644 new mode 100755 diff --git a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh old mode 100644 new mode 100755 index f57021f5fdd..fbe93181aee --- a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh +++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh @@ -5,57 +5,79 @@ set -euo pipefail error_exit() { - printf 'Error: %s' "${1}" + printf 'Error: %s' "${1}" exit 1 } -MAX_WAIT_TIME=40 +MAX_WAIT_TIME=10 #40 POLL_INTERVAL=2 elapsed_time=0 +last_config_index= +last_error= -while true; do - servers=$(nomad operator api /v1/operator/raft/configuration) - leader=$(echo $servers | jq -r '[.Servers[] | select(.Leader == true)']) - echo $servers | jq '.' - echo $leader - if [ $(echo "$leader" | jq 'length') -eq 1 ]; then - break +checkRaftConfiguration() { + local raftConfig leader + raftConfig=$(nomad operator api /v1/operator/raft/configuration) || return 1 + leader=$(echo "$raftConfig" | jq -r '[.Servers[] | select(.Leader == true)']) + + echo "$raftConfig" | jq '.' + echo "$leader" + if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then + last_config_index=$(echo "$raftConfig" | jq -r '.Index') + echo "last_config_index: $last_config_index" + return 0 fi + last_error="No leader found" + return 1 +} + +while true; do + checkRaftConfiguration && break if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "No leader found after $elapsed_time seconds." + error_exit "${last_error} after $elapsed_time seconds." fi - echo "No leader found yet after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." + echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) done -last_config_index=$(echo $servers | jq -r '.Index') -echo "last_config_index: $last_config_index" -for ip in $SERVERS; do -while true; do - echo $ip - node_info=$(nomad agent-info -address "https://$ip:4646" -json) - if [ $? 
-ne 0 ]; then - error_exit "Unable to get info for node at $ip" - fi +# reset timer +elapsed_time=0 +last_log_index= - last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') - if [ "$last_log_index" -ge "$last_config_index" ]; then - break - fi +checkServerHealth() { + local ip node_info + ip=$1 + echo "Checking server health for $ip" + + node_info=$(nomad agent-info -address "https://$ip:4646" -json) \ + || error_exit "Unable to get info for node at $ip" + + last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') + if [ "$last_log_index" -ge "$last_config_index" ]; then + return 0 + fi + + last_error="Expected node at $ip to have last log index at least $last_config_index but found $last_log_index" + return 1 +} + +for ip in $SERVERS; do + while true; do + checkServerHealth "$ip" && break if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Expected node at $ip to have last log index at least $last_config_index but found $last_log_index after $elapsed_time seconds." + error_exit "$last_error after $elapsed_time seconds." fi - echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..." + echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) - done + done done echo "All servers are alive and up to date."
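
Note: the scripts above all converge on the same retry shape: a check function that returns 0 on success, stashes a human-readable reason in last_error on failure, and an outer loop that gives up after MAX_WAIT_TIME. A shared helper could remove the remaining duplication across modules. The sketch below is only an illustration of that pattern under the same conventions; the poll_until name and the example check are hypothetical and not part of this patch.

#!/usr/bin/env bash
# Sketch of a shared polling helper, assuming the conventions used above:
# a check function returns 0 on success, non-zero on failure, and writes a
# human-readable reason into $last_error.
set -euo pipefail

POLL_INTERVAL=2
last_error=

poll_until() {
    # poll_until <max_wait_seconds> <check_function> [args...]
    local max_wait=$1 check=$2
    shift 2
    local elapsed=0

    while ! "$check" "$@"; do
        if [ "$elapsed" -ge "$max_wait" ]; then
            printf 'Error: %s after %s seconds.\n' "$last_error" "$elapsed"
            return 1
        fi
        echo "$last_error; retrying in $POLL_INTERVAL seconds..."
        sleep "$POLL_INTERVAL"
        elapsed=$((elapsed + POLL_INTERVAL))
    done
}

# Example check in the same style as checkAllocsCount/checkReadyClients:
# succeeds once the expected number of allocations is running.
checkAllocsRunning() {
    local want=$1 have
    have=$(nomad alloc status -json |
        jq '[.[] | select(.ClientStatus == "running")] | length') || {
        last_error="could not query allocation status"
        return 1
    }
    [ "$have" -eq "$want" ] && return 0
    last_error="expected $want running allocations, found $have"
    return 1
}

# Usage: wait up to 120 seconds for 5 running allocations.
poll_until 120 checkAllocsRunning 5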