diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index 9e07dc8490e0..03a641447198 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -47,6 +47,8 @@ OCF_RESKEY_use_fqdn_default=false OCF_RESKEY_fqdn_prefix_default="" OCF_RESKEY_max_rabbitmqctl_timeouts_default=3 OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy" +OCF_RESKEY_rmq_feature_health_check_default=true +OCF_RESKEY_rmq_feature_local_list_queues_default=true : ${HA_LOGTAG="lrmd"} : ${HA_LOGFACILITY="daemon"} @@ -68,6 +70,8 @@ OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy" : ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}} : ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}} : ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}} +: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}} +: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}} ####################################################################### @@ -298,6 +302,26 @@ A path to the shell script to setup RabbitMQ policies + + +Since rabbit 3.6.4 list_queues/list_channels-based monitoring should +be replaced with "node_health_check" command, as it creates no network +load at all. + +Use node_health_check for monitoring + + + + + +For rabbit version that implements --local flag for list_queues, this +can greatly reduce network overhead in cases when node is +stopped/demoted. + +Use --local option for list_queues + + + $EXTENDED_OCF_PARAMS @@ -1361,7 +1385,9 @@ check_timeouts() { local timeouts_attr_name=$2 local op_name=$3 - if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then + # 75 is EX_TEMPFAIL from sysexits, and is used by rabbitmqctl to signal about + # timeout. + if [ $op_rc -ne 124 -a $op_rc -ne 137 -a $op_rc -ne 75 ]; then ocf_run attrd_updater -p --name $timeouts_attr_name --update 0 return 0 fi @@ -1392,12 +1418,20 @@ check_timeouts() { } wait_sync() { - wait_time=$1 + local wait_time=$1 + local queues + local opt_arg="" - queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues name state" - su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \ - do sleep 2; done\"" - return $? + if [ "$OCF_RESKEY_rmq_feature_local_list_queues" = "true" ]; then + opt_arg="--local" + fi + + queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues $opt_arg name state" + + su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \ + do sleep 2; done\"" + + return $? } get_monitor() { @@ -1537,7 +1571,62 @@ get_monitor() { return $rc fi - # Check if the rabbitmqctl control plane is alive. + # rc can be SUCCESS or RUNNING_MASTER, don't touch it unless there + # is some error uncovered by node_health_check + if ! node_health_check; then + rc=$OCF_ERR_GENERIC + fi + + ocf_log info "${LH} get_monitor function ready to return ${rc}" + return $rc +} + +# Check if the rabbitmqctl control plane is alive. +node_health_check() { + local rc + if [ "$OCF_RESKEY_rmq_feature_health_check" = true ]; then + node_health_check_local + rc=$? + else + node_health_check_legacy + rc=$? + fi + return $rc +} + +node_health_check_local() { + local LH="${LH} node_health_check_local():" + local rc + local rc_timeouts + + # Give node_health_check some time to handle timeout by itself. + # By using internal rabbitmqctl timeouts, we allow it to print + # more useful diagnostics + local timeout=$((TIMEOUT_ARG - 2)) + su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout" + rc=$? + + check_timeouts $rc "rabbit_node_health_check_timeouts" "node_health_check" + rc_timeouts=$? + + if [ "$rc_timeouts" -eq 2 ]; then + master_score 0 + ocf_log info "${LH} node_health_check timed out, retry limit reached" + return $OCF_ERR_GENERIC + elif [ "$rc_timeouts" -eq 1 ]; then + ocf_log info "${LH} node_health_check timed out, going to retry" + return $OCF_ERR_GENERIC + fi + + if [ "$rc" -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl node_health_check exited with errors." + return $OCF_ERR_GENERIC + else + return $OCF_SUCCESS + fi +} + +node_health_check_legacy() { local rc_alive local timeout_alive su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null" @@ -1630,7 +1719,6 @@ get_monitor() { fi fi - ocf_log info "${LH} get_monitor function ready to return ${rc}" return $rc } @@ -1704,7 +1792,7 @@ action_start() { return $OCF_SUCCESS fi - local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts" + local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts rabbit_node_health_check_timeouts" local attr_name_to_reset for attr_name_to_reset in $attrs_to_zero; do ocf_update_private_attr $attr_name_to_reset 0