From a81272bbf52d3e8a40ed9afc540a537a3fac5ddf Mon Sep 17 00:00:00 2001 From: Alexey Lebedeff Date: Wed, 10 Aug 2016 14:41:40 +0300 Subject: [PATCH] Use new rabbitmqctl features for monitoring The goal is to stop wasting network bandwidth during health checks (e.g. list_queues in a 3-node cluster with 10k queues costs on average 12 megabytes of traffic and 27k TCP packets). The features are disabled by default to preserve compatibility, but they SHOULD be enabled when the following patches are present in the RabbitMQ version in use: - https://github.com/rabbitmq/rabbitmq-server/pull/915 - https://github.com/rabbitmq/rabbitmq-server/pull/911 - https://github.com/rabbitmq/rabbitmq-server/pull/883 --- scripts/rabbitmq-server-ha.ocf | 88 ++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/scripts/rabbitmq-server-ha.ocf b/scripts/rabbitmq-server-ha.ocf index cd07d0c1b0d3..2669737d2f60 100755 --- a/scripts/rabbitmq-server-ha.ocf +++ b/scripts/rabbitmq-server-ha.ocf @@ -47,6 +47,8 @@ OCF_RESKEY_use_fqdn_default=false OCF_RESKEY_fqdn_prefix_default="" OCF_RESKEY_max_rabbitmqctl_timeouts_default=3 OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy" +OCF_RESKEY_rmq_feature_health_check_default=false +OCF_RESKEY_rmq_feature_local_list_queues_default=false : ${HA_LOGTAG="lrmd"} : ${HA_LOGFACILITY="daemon"} @@ -68,6 +70,8 @@ OCF_RESKEY_policy_file_default="/usr/local/sbin/set_rabbitmq_policy" : ${OCF_RESKEY_fqdn_prefix=${OCF_RESKEY_fqdn_prefix_default}} : ${OCF_RESKEY_max_rabbitmqctl_timeouts=${OCF_RESKEY_max_rabbitmqctl_timeouts_default}} : ${OCF_RESKEY_policy_file=${OCF_RESKEY_policy_file_default}} +: ${OCF_RESKEY_rmq_feature_health_check=${OCF_RESKEY_rmq_feature_health_check_default}} +: ${OCF_RESKEY_rmq_feature_local_list_queues=${OCF_RESKEY_rmq_feature_local_list_queues_default}} ####################################################################### @@ -298,6 +302,26 @@ A path to the shell script to setup RabbitMQ policies + + 
+Since rabbit 3.6.4 list_queues/list_channels-based monitoring should +be replaced with "node_health_check" command, as it creates no network +load at all. + +Use node_health_check for monitoring + + + + + +For rabbit version that implements --local flag for list_queues, this +can greatly reduce network overhead in cases when node is +stopped/demoted. + +Use --local option for list_queues + + + $EXTENDED_OCF_PARAMS @@ -1361,7 +1385,9 @@ check_timeouts() { local timeouts_attr_name=$2 local op_name=$3 - if [ $op_rc -ne 124 -a $op_rc -ne 137 ]; then + # 75 is EX_TEMPFAIL from sysexits; rabbitmqctl exits with it to signal a + # timeout. + if [ $op_rc -ne 124 -a $op_rc -ne 137 -a $op_rc -ne 75 ]; then ocf_run attrd_updater -p --name $timeouts_attr_name --update 0 return 0 fi @@ -1392,12 +1418,20 @@ check_timeouts() { } wait_sync() { - wait_time=$1 + local wait_time=${1:?} + local queues + local opt_arg="" - queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues name state" - su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \ - do sleep 2; done\"" - return $? + if [ "$OCF_RESKEY_rmq_feature_local_list_queues" = "true" ]; then + opt_arg="--local" + fi + + queues="${COMMAND_TIMEOUT} ${OCF_RESKEY_ctl} list_queues $opt_arg name state" + + su_rabbit_cmd -t "${wait_time}" "sh -c \"while ${queues} | grep -q 'syncing,'; \ + do sleep 2; done\"" + + return $? } get_monitor() { @@ -1537,7 +1571,45 @@ get_monitor() { return $rc fi - # Check if the rabbitmqctl control plane is alive. + node_health_check + rc=$? + + return $rc +} + +# Check if the rabbitmqctl control plane is alive. 
+node_health_check() { + if [ "$OCF_RESKEY_rmq_feature_health_check" = true ]; then + node_health_check_local + else + node_health_check_legacy + fi +} + +node_health_check_local() { + local LH="${LH} node_health_check_local():" + local rc + local rc_timeouts + + local timeout=$((TIMEOUT_ARG - 2)) # pass a slightly shorter timeout (TIMEOUT_ARG - 2) so node_health_check can handle the timeout by itself + su_rabbit_cmd "${OCF_RESKEY_ctl} node_health_check -t $timeout" + rc=$? + + check_timeouts $rc "rabbit_node_health_check_timeouts" "node_health_check" + rc_timeouts=$? + + if [ "$rc_timeouts" -eq 2 ]; then + master_score 0 + return $OCF_ERR_GENERIC + elif [ $rc -ne 0 ]; then + ocf_log err "${LH} rabbitmqctl node_health_check exited with errors." + return $OCF_ERR_GENERIC + else + return $OCF_SUCCESS + fi +} + +node_health_check_legacy() { local rc_alive local timeout_alive su_rabbit_cmd "${OCF_RESKEY_ctl} list_channels 2>&1 > /dev/null" @@ -1704,7 +1776,7 @@ action_start() { return $OCF_SUCCESS fi - local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts" + local attrs_to_zero="rabbit_list_channels_timeouts rabbit_get_alarms_timeouts rabbit_list_queues_timeouts rabbit_cluster_status_timeouts rabbit_node_health_check_timeouts" local attr_name_to_reset for attr_name_to_reset in $attrs_to_zero; do ocf_update_private_attr $attr_name_to_reset 0