Skip to content

Commit

Permalink
Use new rabbitmqctl features for monitoring
Browse files Browse the repository at this point in the history
This stops health checks from wasting network bandwidth (e.g. list_queues
in a 3-node cluster with 10k queues costs on average 12 megabytes of
traffic and 27k TCP packets).

The features are disabled by default to preserve compatibility, but they
SHOULD be enabled when the following patches are present in the RabbitMQ
version currently in use:
- #915
- #911
- #883
  • Loading branch information
Alexey Lebedeff committed Aug 10, 2016
1 parent ca4c0c0 commit a81272b
Showing 1 changed file with 80 additions and 8 deletions.
88 changes: 80 additions & 8 deletions scripts/rabbitmq-server-ha.ocf
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ OCF_RESKEY_use_fqdn_default=false
OCF_RESKEY_fqdn_prefix_default=""
OCF_RESKEY_max_rabbitmqctl_timeouts_default=3
OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy"
# Optional rabbitmqctl-based features. Both default to false so that the agent
# keeps its previous behaviour unless the operator explicitly enables them.
OCF_RESKEY_rmq_feature_health_check_default=false
OCF_RESKEY_rmq_feature_local_list_queues_default=false

# Default log tag and log facility; a value already present in the
# environment is kept as-is.
: ${HA_LOGTAG="lrmd"}
: ${HA_LOGFACILITY="daemon"}
Expand All @@ -68,6 +70,8 @@ OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy"
: ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}}
: ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}}
: ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}}
# Feature toggles fall back to their *_default value only when the parameter
# is unset; an explicitly supplied value is left untouched.
: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}}
: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}}

#######################################################################

Expand Down Expand Up @@ -298,6 +302,26 @@ A path to the shell script to setup RabbitMQ policies
<content type="string" default="${OCF_RESKEY_policy_file_default}" />
</parameter>
<parameter name="rmq_feature_health_check" unique="0" required="0">
<longdesc lang="en">
Since rabbit 3.6.4 list_queues/list_channels-based monitoring should
be replaced with "node_health_check" command, as it creates no network
load at all.
</longdesc>
<shortdesc lang="en">Use node_health_check for monitoring</shortdesc>
<content type="string" default="${OCF_RESKEY_rmq_feature_health_check_default}" />
</parameter>
<parameter name="rmq_feature_local_list_queues" unique="0" required="0">
<longdesc lang="en">
For rabbit version that implements --local flag for list_queues, this
can greatly reduce network overhead in cases when node is
stopped/demoted.
</longdesc>
<shortdesc lang="en">Use --local option for list_queues</shortdesc>
<content type="string" default="${OCF_RESKEY_rmq_feature_local_list_queues_default}" />
</parameter>
$EXTENDED_OCF_PARAMS
</parameters>
Expand Down Expand Up @@ -1361,7 +1385,9 @@ check_timeouts() {
local timeouts_attr_name=$2
local op_name=$3

if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then
# 75 is EX_TEMPFAIL from sysexits, and is used by rabbitmqctl to signal about
# timeout.
if [ $op_rc -ne 124 -a $op_rc -ne 137 -a $op_rc -ne 75 ]; then
ocf_run attrd_updater -p --name $timeouts_attr_name --update 0
return 0
fi
Expand Down Expand Up @@ -1392,12 +1418,20 @@ check_timeouts() {
}

# Poll "rabbitmqctl list_queues" every 2 seconds for as long as its output
# still contains a queue in the 'syncing,' state.
#
# Globals:
#   COMMAND_TIMEOUT, OCF_RESKEY_ctl               (read) command prefix and
#                                                 rabbitmqctl binary
#   OCF_RESKEY_rmq_feature_local_list_queues      (read) "true" adds --local
# Arguments:
#   $1 - value handed to "su_rabbit_cmd -t"; mandatory, the ${1:?} expansion
#        aborts when it is missing or empty.
# Returns:
#   The exit status of su_rabbit_cmd.
wait_sync() {
  local wait_time=${1:?}
  local queues
  local poll_loop
  local opt_arg=""

  # With --local only the queues of this node are listed, which avoids
  # cluster-wide traffic; it requires a rabbitmqctl that knows the flag.
  if [ "$OCF_RESKEY_rmq_feature_local_list_queues" = "true" ]; then
    opt_arg="--local"
  fi

  # $opt_arg is deliberately unquoted inside the command string: when empty
  # it must vanish instead of becoming an empty argument.
  queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues $opt_arg name state"
  poll_loop="while ${queues} | grep -q 'syncing,'; do sleep 2; done"

  su_rabbit_cmd -t "${wait_time}" "sh -c \"${poll_loop}\""

  return $?
}

get_monitor() {
Expand Down Expand Up @@ -1537,7 +1571,45 @@ get_monitor() {
return $rc
fi

# Check if the rabbitmqctl control plane is alive.
node_health_check
rc=$?

return $rc
}

# Check if the rabbitmqctl control plane is alive.
#
# Dispatches to node_health_check_local when the rmq_feature_health_check
# parameter is exactly "true", and to node_health_check_legacy for any other
# value. The exit status of the selected check is passed through unchanged.
node_health_check() {
  case "$OCF_RESKEY_rmq_feature_health_check" in
    true)
      node_health_check_local
      ;;
    *)
      node_health_check_legacy
      ;;
  esac
}

# Monitor the node with "rabbitmqctl node_health_check".
#
# Globals:
#   TIMEOUT_ARG, OCF_RESKEY_ctl, LH, OCF_SUCCESS, OCF_ERR_GENERIC (read)
# Returns:
#   OCF_ERR_GENERIC - check_timeouts reported status 2 (master score is reset
#                     to 0 first), or rabbitmqctl failed for a reason that is
#                     not a timeout.
#   OCF_SUCCESS     - the health check passed, or it timed out but
#                     check_timeouts reported status 1.
node_health_check_local() {
  local LH="${LH} node_health_check_local():"
  local rc
  local rc_timeouts
  # Give node_health_check some time to handle the timeout by itself.
  local timeout=$((TIMEOUT_ARG - 2))

  su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout"
  rc=$?

  check_timeouts "$rc" "rabbit_node_health_check_timeouts" "node_health_check"
  rc_timeouts=$?

  if [ "$rc_timeouts" -eq 2 ]; then
    master_score 0
    return "$OCF_ERR_GENERIC"
  elif [ "$rc_timeouts" -eq 1 ]; then
    # NOTE(review): status 1 is taken to mean "timed out, but the number of
    # consecutive timeouts is still within max_rabbitmqctl_timeouts" - confirm
    # against check_timeouts. Failing here would make that tolerance useless,
    # because a timed-out rabbitmqctl always exits non-zero.
    ocf_log info "${LH} node_health_check timed out, going to retry"
    return "$OCF_SUCCESS"
  fi

  if [ "$rc" -ne 0 ]; then
    ocf_log err "${LH} rabbitmqctl node_health_check exited with errors."
    return "$OCF_ERR_GENERIC"
  fi

  return "$OCF_SUCCESS"
}

node_health_check_legacy() {
local rc_alive
local timeout_alive
su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null"
Expand Down Expand Up @@ -1704,7 +1776,7 @@ action_start() {
return $OCF_SUCCESS
fi

local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts"
local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts rabbit_node_health_check_timeouts"
local attr_name_to_reset
for attr_name_to_reset in $attrs_to_zero; do
ocf_update_private_attr $attr_name_to_reset 0
Expand Down

0 comments on commit a81272b

Please sign in to comment.